In [None]:
# install packages
# pip install factor_analyzer
#Import packages
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from factor_analyzer import FactorAnalyzer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

In [2]:
# Loading the Student dataset
# Load the Portuguese-language and mathematics student datasets for analysis

In [None]:
# Both exports are semicolon-delimited; read each one into its own DataFrame
csv_options = {'sep': ';'}
mt = pd.read_csv('student-mat.csv', **csv_options)
por = pd.read_csv('student-por.csv', **csv_options)

# Add a subject column to both datasets

In [None]:
# Tag every row with the course it came from so the origin is kept after merging
for frame, label in ((mt, 'math'), (por, 'por')):
    frame['subject'] = label

### Merge datasets

In [None]:
# Stack the two course tables. ignore_index=True gives the merged frame a fresh
# 0..n-1 index; without it the row labels of both source files are repeated,
# so a label-based lookup such as df.loc[0] would return two rows.
df = pd.concat([mt, por], ignore_index=True)

### Rename column labels

In [None]:
# Descriptive replacements for the dataset's column labels.
# The assignment is positional, so the order below must match the column order of df.
renamed_columns = [
    # student background
    'school', 'sex', 'age', 'address', 'family_size', 'parents_status',
    'mother_edu', 'father_edu', 'mother_job', 'father_job', 'reason', 'guardian',
    # schooling and support
    'commute_time', 'study_time', 'failures', 'school_support', 'family_support',
    'paid_classes', 'activities', 'nursery', 'desire_higher_edu', 'internet',
    # lifestyle
    'romantic', 'family_quality', 'free_time', 'go_out',
    'weekday_alcohol_usage', 'weekend_alcohol_usage', 'health', 'absences',
    # grades and the subject tag added before merging
    'period_one_score', 'period_two_score', 'final_score', 'subject',
]
df.columns = renamed_columns

### Convert final_score to a categorical variable (well: 15-20, average: 10-14, poor: 0-9)

In [None]:
# Bucket the final score into three labelled bands (bounds are inclusive);
# any score that falls outside every band keeps the 'na' placeholder.
final = df['final_score']
df['final_grade'] = 'na'
for label, low, high in (('well', 15, 20), ('average', 10, 14), ('poor', 0, 9)):
    df.loc[final.between(low, high), 'final_grade'] = label
df.tail(5)

### Length of the data

In [None]:
# Number of rows in the merged dataset
data_length = df.shape[0]
print('Data length is: ', data_length)

### Number of variables

In [None]:
# Number of columns in the merged dataset
features = df.shape[1]
print('Number of features: ', features)

### Data type information

In [None]:
# Show the dtype pandas assigned to each column
df.dtypes

### Check for missing values

In [None]:
# Count the null entries in each column
df.isnull().sum()

### Analysis of Age vs Absences

In [None]:
# Bar chart of absences against student age.
# The x values are taken from the age column so the data matches the
# axis label and the title.
plt.bar(df.age, df.absences)
plt.xlabel('Age')
plt.ylabel('Absences')
plt.title('Age vs Absences of the students')
plt.show()

### Histogram of father job

In [None]:
# Frequency of each father-job category, labelled through the returned Axes
ax = df['father_job'].hist()
ax.set_title('Histogram of father job')
ax.set_xlabel('Father Job')
ax.set_ylabel('Distribution')
plt.show()

### Subjects vs Age

In [None]:
# Number of students at each age, split by subject
ax = sns.countplot(x='age', hue='subject', data=df)
ax.set_title('Comparison of age in subjects')
plt.show()

### Student percentage living area by parent status

In [None]:
# Count students for every (parent status, address) pair
ad_tab1 = pd.crosstab(index=df.parents_status, columns=df.address)
# Share of each address group that falls under each parent status: divide the
# raw counts by their column totals. The counts are used directly because a
# log transform before normalising would no longer give proportions.
ad_perc = (ad_tab1 / ad_tab1.sum()).reindex(['A', 'T'])
ad_perc.plot.bar(colormap="RdYlGn_r", fontsize=16, figsize=(8,4))
plt.title('Student percentage living area by parent status', fontsize=20)
plt.ylabel('Percentage of Student', fontsize=16)
plt.xlabel('Status', fontsize=16)
# Render the bar chart before the next figure is started
plt.show()

# Box plot for the final score
plt.figure(figsize=(6, 4))
sns.boxplot(x='final_score', data=df)
plt.title('Final score Distribution')
plt.show()

### Handling categorical variables

In [None]:
# Binary-encode three columns: the listed level becomes 0, every other value 1
for column, zero_level in (('sex', 'M'), ('address', 'U'), ('subject', 'math')):
    df[column] = (df[column] != zero_level).astype('int64')

# Ordinal codes for the grade bands; a label missing from the dict raises KeyError
encode_final_grade = {'poor':0,'average':1, 'well':2}
df['final_grade'] = [encode_final_grade[grade] for grade in df['final_grade']]

# Factor analysis is fitted on the non-object columns only
X = df.select_dtypes(exclude=['object'])
n_factors = 5  # Number of factors to extract
fa = FactorAnalyzer(n_factors, rotation='varimax')
fa.fit(X)

In [None]:
# Keep the fitted loadings and the first array returned by get_eigenvalues()
loadings = fa.loadings_
eigenvalues, _ = fa.get_eigenvalues()

# Scree plot: eigenvalue against factor number, used to choose how many factors to keep
factor_numbers = range(1, len(eigenvalues) + 1)
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(factor_numbers, eigenvalues)
ax.plot(factor_numbers, eigenvalues, marker='o', linestyle='-')
ax.set_title('Scree Plot')
ax.set_xlabel('Factors')
ax.set_ylabel('Eigenvalue')
ax.grid()
plt.show()

In [None]:
# One row per input column, one labelled column per extracted factor
factor_labels = [f'Factor {number}' for number in range(1, n_factors + 1)]
factor_loadings = pd.DataFrame(loadings, index=X.columns, columns=factor_labels)
print("Factor Loadings:")
print(factor_loadings)

#### Display factor loadings

In [None]:
# The loadings table is built in the preceding cell from the same inputs,
# so it is printed here without being constructed a second time.
print("Factor Loadings:")
print(factor_loadings)

### Regression

In [None]:
# Features are the non-object columns, minus the two grade columns:
# final_grade is dropped first, and final_score is the target itself, so
# leaving it in X would hand the answer to the model.
df1 = df.drop('final_grade', axis=1)
X = df1.select_dtypes(exclude=['object']).drop(columns=['final_score'])
y = df['final_score']

# Hold out 25% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Fit a random forest regressor on the training split
rf_model = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42)
rf_model.fit(X_train, y_train)

# Score the held-out rows
predictions = rf_model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
rmse = mse ** 0.5  # root of the MSE
r2 = r2_score(y_test, predictions)
print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"RMSE : {rmse}")
print(f"R2 : {r2}")

### Extracting feature importances

In [None]:
# Importance scores from the fitted regressor, paired with the feature columns
feature_importances = rf_model.feature_importances_
feature_names = X.columns

# Order from most to least important
sorted_indices = np.argsort(feature_importances)[::-1]
sorted_feature_importances = feature_importances[sorted_indices]
sorted_feature_names = feature_names[sorted_indices]

# One bar per feature, with the feature names as rotated tick labels
bar_positions = range(len(sorted_feature_importances))
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(bar_positions, sorted_feature_importances, align='center')
ax.set_xticks(bar_positions)
ax.set_xticklabels(sorted_feature_names, rotation=90)
ax.set_xlabel('Features')
ax.set_ylabel('Importance')
ax.set_title('Feature Importances')
fig.tight_layout()
plt.show()

### Classification

In [None]:
# Target: the encoded grade band. It is read without pop() so df is left
# unchanged and this cell can be re-run.
y = df['final_grade'].values
# final_grade is a direct binning of final_score, so final_score is removed
# from the features together with the target column to avoid leaking the label.
X = df.drop(columns=['final_grade', 'final_score']).select_dtypes(exclude=['object']).values

# Hold out 30% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 41)

# Fit a random forest classifier on the training split
rf_clf = RandomForestClassifier(n_estimators=100, random_state=41)
rf_clf.fit(X_train, y_train)

# Score the held-out rows
y_pred = rf_clf.predict(X_test)
rf_acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {rf_acc:.2f} ({rf_acc:.0%})")

In [None]:
# Persist the regression metrics computed in the Regression cell (mse, mae, r2)
# rather than hard-coded numbers, so the saved artifacts reflect the current run.
# float() turns the scores into plain Python floats for JSON.
metrics = {"mse": float(mse), "mae": float(mae), "r2": float(r2)}

os.makedirs("artifacts", exist_ok=True)

# Save metrics
with open("artifacts/metrics.json", "w") as f:
    json.dump(metrics, f, indent=2)

# Bar chart of the saved metrics, written to disk and closed without display
plt.figure()
plt.bar(list(metrics.keys()), list(metrics.values()))
plt.title("Model Performance")
plt.savefig("artifacts/performance.png")
plt.close()