In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [None]:
# URL of the grades dataset (raw CSV hosted on GitHub)
url="https://raw.githubusercontent.com/FlipRoboTechnologies/ML-Datasets/main/Grades/Grades.csv"
# Read the CSV straight from the URL into a DataFrame
df=pd.read_csv(url)
# Show the loaded frame as the cell output
df

# Exploratory Data Analysis (EDA)

In [None]:
# Dimensions of the loaded frame as (rows, columns)
df.shape

The dataset contains 571 rows and 43 columns.

In [None]:
# Column labels of the dataset
df.columns

In [None]:
# Data type of every column
df.dtypes

Only one column is numeric (float64); the other 42 columns have the object (string) dtype.

In [None]:
# First five rows of the dataset
df.head()

In [None]:
# Last 15 rows of the dataset
df.tail(15)

Here we can see a few NaN values in the table; for example, row 565 has NaN in many of its entries.

# np.nan,None,NaN,others

In [None]:
# Number of null values in each column
df.isnull().sum()

In [None]:
# Total number of null values across the whole dataset
df.isnull().sum().sum()

In [None]:
# Summary of the frame: column dtypes and non-null counts
df.info()

In [None]:
# Visualize where the nulls are: heatmap of the boolean null mask
sns.heatmap(df.isnull())

There are 425 null values, and with a limited dataset of only 571 rows we cannot afford to drop that much data, so let's take a closer look.

In [None]:
# Show the frame again before any rows are removed
df

In [None]:
# Hand-picked row labels to remove from the dataset.
rows_to_drop = [44, 60, 91, 137, 142, 143, 281, 282, 288, 565]

# Drop those rows and report the remaining shape.
df = df.drop(index=rows_to_drop)
df.shape

In [None]:
# Distinct values present in the CGPA column
df["CGPA"].unique()

In [None]:
# Number of distinct values in the CGPA column
df["CGPA"].nunique()

In [None]:
# Print the frequency table of every column, separated by blank lines.
for column_name in df.columns:
    print(df[column_name].value_counts(), end="\n\n\n")

We can also drop the `Seat No.` column: it has no bearing on how the grades relate to CGPA, and its unique count equals the total number of entries (it is just an identifier).

In [None]:
# Remove the "Seat No." column in place (axis=1 selects columns)
df.drop("Seat No.",axis=1,inplace=True)

Next, let's fill the missing values with `SimpleImputer`.

# Feature Engineering 

In [None]:
from sklearn.impute import SimpleImputer

# A single imputer, re-fitted for every column: np.nan entries are replaced
# with that column's most frequent value.
imp = SimpleImputer(missing_values=np.nan, strategy="most_frequent")

for col_name in df.columns:
    # The imputer works on 2-D input, so each column is passed as an (n, 1) array.
    column_2d = df[col_name].values.reshape(-1, 1)
    df[col_name] = imp.fit_transform(column_2d)

df

In [None]:
# Total number of null values still present in the frame
df.isnull().sum().sum()

In [None]:
# Number of unique values in each column, shown as a one-column table
df.nunique().to_frame("No. of unique value")

In [None]:
# Column dtypes at this point of the notebook
df.dtypes

The goal for this dataset is to predict CGPA from the grades scored in each subject. Because the target is numeric (float64), this is a regression problem, so the object columns are converted into numeric data using a label encoder.

In [None]:
from sklearn.preprocessing import LabelEncoder

LE = LabelEncoder()

# Replace every object-typed column with the integer codes produced by the encoder.
# NOTE(review): the codes come from the encoder's own label ordering, which may
# not match the academic ordering of the grades - confirm this is acceptable
# for the regression models used later.
object_columns = [name for name in df.columns if df[name].dtypes == "object"]
for name in object_columns:
    df[name] = LE.fit_transform(df[name].values.reshape(-1, 1))

df

# Description of Dataset

In [None]:
# Statistical summary (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

The summary shows the min, max, quartiles (IQR), mean and median. These statistics are most meaningful for the target column, since the other features are categorical grades.

In [None]:
# Render the describe() table as an annotated heatmap.
summary_stats = df.describe()

plt.figure(figsize=(22, 10))
sns.heatmap(
    summary_stats,
    annot=True,
    fmt="0.2f",
    linewidth=0.2,
    linecolor="black",
    cmap="Spectral",
)
plt.title("Descriptive Graph", fontsize=20)
plt.xlabel("Features Names", fontsize=14)
plt.ylabel("Descriptive Stats", fontsize=14)
plt.show()

# Data Visualization

# Univariate Analysis

In [None]:
# Count plot of the target column: one bar per distinct CGPA value

sns.countplot(x='CGPA',data=df)


In [None]:
# One count plot per column (features and target), each on its own figure.
for column_name in df.columns:
    plt.figure(figsize=(12, 3), facecolor="white")
    sns.countplot(data=df, x=column_name)
plt.show()

In [None]:
# Distribution of every column, drawn on a 9x5 grid of subplots.
#
# sns.distplot is deprecated in seaborn; histplot(kde=True, stat="density")
# is its replacement and draws the same histogram + KDE overlay
# (kde_kws=dict(cut=3) keeps the KDE tails extending past the data as before).
GRID_ROWS, GRID_COLS = 9, 5

plt.figure(figsize=(35, 30), facecolor="white")
for plotnum, col in enumerate(df.columns, start=1):
    if plotnum > GRID_ROWS * GRID_COLS:
        break  # the grid has no panel left for further columns
    ax = plt.subplot(GRID_ROWS, GRID_COLS, plotnum)
    sns.histplot(df[col], kde=True, stat="density", kde_kws=dict(cut=3), color="m", ax=ax)
    plt.xlabel(col, fontsize=10)
    plt.yticks(rotation=0, fontsize=10)
plt.show()

# Bivariate Analysis

In [None]:
# Regression plot of CGPA (y) against every column (x), one figure per column.
#
# sns.lmplot is a figure-level function: it always creates its own figure, so
# calling plt.figure() before it only left an empty figure behind for every
# column. The `palette` argument was also dropped because it has no effect
# without a `hue` variable.
for i in df.columns:
    grid = sns.lmplot(x=i, y="CGPA", data=df)
    grid.set_axis_labels(i, "CGPA")
plt.show()

In [None]:
# Regression plot of every column (y) against CGPA (x), one figure per column.
#
# sns.lmplot is a figure-level function: it always creates its own figure, so
# calling plt.figure() before it only left an empty figure behind for every
# column. The `palette` argument was also dropped because it has no effect
# without a `hue` variable.
for i in df.columns:
    grid = sns.lmplot(x="CGPA", y=i, data=df)
    grid.set_axis_labels("CGPA", i)
plt.show()

From both plots we can observe that as the grade improves from F/D towards A+, the CGPA increases, and vice versa.

In [None]:
# Regression plot of each column against CGPA on an 11x4 subplot grid;
# at most the first 42 columns are drawn.
plt.figure(figsize=(20, 25))
for panel, column in enumerate(df.columns, start=1):
    if panel > 42:
        continue
    plt.subplot(11, 4, panel)
    sns.regplot(data=df, x="CGPA", y=column, color="r")
    plt.xlabel("CGPA")
    plt.ylabel(column)

plt.show()

In [None]:
# Scatter plot of each column against CGPA on an 11x4 subplot grid;
# at most the first 42 columns are drawn.
plt.figure(figsize=(20, 25))
for panel, column in enumerate(df.columns, start=1):
    if panel > 42:
        continue
    plt.subplot(11, 4, panel)
    sns.scatterplot(data=df, x="CGPA", y=column, color="r")
    plt.xlabel("CGPA")
    plt.ylabel(column)

plt.show()

In [None]:
# Bar plot of CGPA for each value of every column, to look for relationships.
# Drawn on an 11x4 subplot grid; at most the first 42 columns are shown.
plt.figure(figsize=(20, 25))
for panel, column in enumerate(df.columns, start=1):
    if panel > 42:
        continue
    plt.subplot(11, 4, panel)
    sns.barplot(data=df, x=column, y="CGPA", color='b')
    plt.xlabel(column)
    plt.ylabel("CGPA Score")
plt.show()

Grade wise CGPA Score

In [None]:
# Bivariate histogram of every column against CGPA, to look for relationships.
# Drawn on an 11x4 subplot grid; at most the first 42 columns are shown.
plt.figure(figsize=(20, 25))
for panel, column in enumerate(df.columns, start=1):
    if panel > 42:
        continue
    plt.subplot(11, 4, panel)
    sns.histplot(data=df, x=column, y="CGPA", color='m')
    plt.xlabel(column)
    plt.ylabel("CGPA score")
plt.show()

# Multivariate Analysis

In [None]:
#sns.pairplot(data=df)

# Checking for outliers

In [None]:
# Box plot of every column to inspect outliers, laid out on an 11x4 grid.
plt.figure(figsize=(10, 15), facecolor="red")
for slot, column in enumerate(df.columns, start=1):
    if slot > 43:
        continue
    plt.subplot(11, 4, slot)
    sns.boxplot(y=df[column], palette="Set2_r")
    plt.xlabel(column, fontsize=5)
    plt.yticks(rotation=2, fontsize=2)

plt.tight_layout()
plt.show()

In [None]:
# Same box plots as before, but drawn on the default-sized current figure
# (no explicit plt.figure / tight_layout here).
for slot, column in enumerate(df.columns, start=1):
    if slot > 43:
        continue
    plt.subplot(11, 4, slot)
    sns.boxplot(y=df[column], palette="Set2_r")
    plt.xlabel(column, fontsize=5)
    plt.yticks(rotation=2, fontsize=2)

plt.show()

In [None]:
# Statistical summary again, as a reference for the outlier treatment below
df.describe()

In [None]:
# The box plots show outliers, so compute the absolute z-score of every value.
# NOTE(review): every column is included here, the CGPA target as well - confirm
# that filtering rows on the target's z-score is intended.

from scipy.stats import zscore
out_features=df.iloc[:,:]
z=np.abs(zscore(out_features))
# z is indexed with .iloc and compared element-wise in the following cells,
# so it is expected to be DataFrame-like with the same layout as df
z


In [None]:
# Positions (row indices, column indices) whose absolute z-score exceeds the threshold of 3
np.where(z>3)

In [None]:
# Spot check: absolute z-score at positional row 180, column 11
z.iloc[180,11]

In [None]:
# Spot check: absolute z-score at positional row 560, column 36
z.iloc[560,36]

In [None]:
# Keep only the rows whose absolute z-score is below 3 in every column
df1=df[(z<3).all(axis=1)]

In [None]:
# Dimensions of the filtered frame as (rows, columns)
df1.shape

In [None]:
# Row counts before and after the z-score filter
print("Old Data Frame:",df.shape[0])
print("New Data Frame:",df1.shape[0])

In [None]:
# Share of rows removed by the z-score filter, as a percentage of the original row count
print("The loss of data in percentage",((df.shape[0]-df1.shape[0])/df.shape[0])*100)

# Checking for Skewness

In [None]:
# Skewness of every column of the filtered frame
df1.skew()

In [None]:
# There is not much skewness present, so no transformation is applied.
# (A cube-root transform of one column was tried and left disabled:)
#df1["CS-406"]= np.cbrt(df1["CS-406"])

In [None]:
# Distribution of every column of the outlier-filtered frame on an 11x4 grid;
# at most the first 42 columns are drawn.
#
# sns.distplot is deprecated in seaborn; histplot(kde=True, stat="density")
# is its replacement and draws the same histogram + KDE overlay
# (kde_kws=dict(cut=3) keeps the KDE tails extending past the data as before).
plt.figure(figsize=(10, 15), facecolor="red")

for plotnumber, column in enumerate(df1.columns, start=1):
    if plotnumber > 42:
        break
    ax = plt.subplot(11, 4, plotnumber)
    sns.histplot(df1[column], kde=True, stat="density", kde_kws=dict(cut=3), color='green', ax=ax)
    plt.xlabel(column, fontsize=10)
plt.tight_layout()
plt.show()

# Correlation b/w target variable and independent variable

# Multicollinearity

In [None]:
# Pairwise correlation matrix of all columns in the filtered frame
cor=df1.corr()
cor

In [None]:
# Visualize the correlation matrix as an annotated heatmap.
# The title and axis labels now describe this figure; they had been copied
# from the descriptive-statistics heatmap ("Descriptive Graph", "Figure").

plt.figure(figsize=(26,14))
sns.heatmap(cor,annot=True,fmt="0.2f",linewidth = 0.2,linecolor="black", cmap="Spectral")
plt.xlabel("Features",fontsize=14)
plt.ylabel("Features", fontsize=14)
plt.title("Correlation Matrix", fontsize=20)
plt.show()

In [None]:
# Correlation of every column with CGPA, sorted from highest to lowest
cor["CGPA"].sort_values(ascending=False)

# Visualizing the correlation between label and features using bar plot

In [None]:
# Bar chart of each feature's correlation with CGPA, highest first;
# the target's correlation with itself is left out.
feature_corr = cor["CGPA"].sort_values(ascending=False).drop(["CGPA"])

plt.figure(figsize=(22, 7))
feature_corr.plot(kind='bar', color='b')
plt.title("Visualizing the correlation between label and features using bar plot")
plt.xlabel("Feature", fontsize=15)
plt.ylabel("Target", fontsize=15)
plt.show()


This plot shows that the grade in subject CS-414 is the one most strongly associated with a high CGPA.

# Separating features and label

In [None]:
# Separate the features from the target by column name.
# After "Seat No." was dropped the frame has 42 columns, so the former
# positional slice iloc[:, 0:42] selected every column - CGPA included - and
# leaked the target into the feature matrix. Selecting by name avoids that and
# does not depend on the column order.
x = df1.drop(columns="CGPA")
y = df1["CGPA"]

In [None]:
# Display the feature matrix
x

No feature scaling is needed: all the features are categorical (ordinal) grades that are already on a common encoded scale.

In [None]:
# Display the target values
y

# Checking Variance Inflation Factor(VIF)

In [None]:
# Variance inflation factor of every feature column in x.

from statsmodels.stats.outliers_influence import variance_inflation_factor

feature_matrix = x.values
vif = pd.DataFrame({
    "VIF values": [
        variance_inflation_factor(feature_matrix, col_idx)
        for col_idx in range(x.shape[1])
    ],
    "Features": x.columns,
})

vif

The VIF values are roughly in the range 8–12 for all columns, with no single feature standing out, and the number of rows is limited, so no features are dropped at this step.

In [None]:
# Frequency of each distinct target value
y.value_counts()

No resampling (over- or under-sampling) is needed, as the number of unique target values is almost equal to the total number of rows.

# Modelling

In [None]:
# Finding the best random state

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression

In [None]:
# Find the train/test split seed (0..199) on which a plain LinearRegression
# reaches the highest test R2, and store it in maxRS for the split that follows.
#
# Fixes: the loop used to assign `maxRs` (lower-case s) while `maxRS` is the
# name initialised here and read later, so the split always used seed 0.
# maxAccu also starts at -inf because R2 can be negative.
# NOTE(review): choosing the seed by test-set score gives an optimistic
# estimate; the cross-validation scores further down are the fairer measure.

maxAccu = -np.inf
maxRS = 0

for i in range(0, 200):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.20, random_state=i)
    lr = LinearRegression()
    lr.fit(x_train, y_train)
    pred = lr.predict(x_test)
    acc = r2_score(y_test, pred)
    if acc > maxAccu:
        maxAccu = acc
        maxRS = i

print("Maximum r2 score is", maxAccu, "on Random_state", maxRS)

In [None]:
# Re-create the 80/20 train/test split using the seed stored in maxRS
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=.20,random_state=maxRS)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor as KNN
from sklearn.linear_model import Lasso,Ridge


In [None]:
# Linear regression baseline: fit on the training split, then report
# test-set metrics and the training R2 (the latter scaled by 100).
LR = LinearRegression().fit(x_train, y_train)
predLR = LR.predict(x_test)
pred_train = LR.predict(x_train)

test_mse = mean_squared_error(y_test, predLR)
print(LR)
for label, value in [
    ("R2_score:", r2_score(y_test, predLR)),
    ("R2 score on Training data:", r2_score(y_train, pred_train) * 100),
    ("Mean Absolute Error:", mean_absolute_error(y_test, predLR)),
    ("Mean Squared Error:", test_mse),
    ("Root Mean Squared Error:", np.sqrt(test_mse)),
]:
    print(label, value)

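The linear regression model reaches an R² score of 100 with an MSE of 4.20, which looks like the best possible performance; a perfect score is unusual, though, so it is worth double-checking that the target has not leaked into the features. Let's compare with other modelling techniques.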

In [None]:
# Actual vs. predicted CGPA for the linear regression model.
# The actual values now go on the x-axis and the predictions on the y-axis so
# that the data match the axis labels (they were swapped before). The blue
# line is the identity line y = x, i.e. where perfect predictions would fall.
plt.figure(figsize=(10,6))
plt.scatter(x=y_test,y=predLR,color="r")
plt.plot(y_test,y_test,color="b")
plt.xlabel("Actual",fontsize=14)
plt.ylabel("Predicted",fontsize=14)
plt.title("Linear Regression",fontsize=18)
plt.show()

In [None]:
# Random forest regressor: fit on the training split, then report
# test-set metrics and the training R2 (the latter scaled by 100).
RFR = RandomForestRegressor().fit(x_train, y_train)
predRFR = RFR.predict(x_test)
pred_train = RFR.predict(x_train)

test_mse = mean_squared_error(y_test, predRFR)
print(RFR)
for label, value in [
    ("R2_score:", r2_score(y_test, predRFR)),
    ("R2 score on Training data:", r2_score(y_train, pred_train) * 100),
    ("Mean Absolute Error:", mean_absolute_error(y_test, predRFR)),
    ("Mean Squared Error:", test_mse),
    ("Root Mean Squared Error:", np.sqrt(test_mse)),
]:
    print(label, value)

In [None]:
# K-nearest-neighbours regressor: fit on the training split, then report
# test-set metrics and the training R2 (the latter scaled by 100).
knn = KNN().fit(x_train, y_train)
predknn = knn.predict(x_test)
predtrain = knn.predict(x_train)

test_mse = mean_squared_error(y_test, predknn)
print(knn)
for label, value in [
    ("R2_score:", r2_score(y_test, predknn)),
    ("R2 score on Training data:", r2_score(y_train, predtrain) * 100),
    ("Mean Absolute Error:", mean_absolute_error(y_test, predknn)),
    ("Mean Squared Error:", test_mse),
    ("Root Mean Squared Error:", np.sqrt(test_mse)),
]:
    print(label, value)

In [None]:
# Gradient boosting regressor: fit on the training split, then report
# test-set metrics and the training R2 (the latter scaled by 100).
GBR = GradientBoostingRegressor().fit(x_train, y_train)
predGBR = GBR.predict(x_test)
pred_train = GBR.predict(x_train)

test_mse = mean_squared_error(y_test, predGBR)
print(GBR)
for label, value in [
    ("R2_score:", r2_score(y_test, predGBR)),
    ("R2 score on Training data:", r2_score(y_train, pred_train) * 100),
    ("Mean Absolute Error:", mean_absolute_error(y_test, predGBR)),
    ("Mean Squared Error:", test_mse),
    ("Root Mean Squared Error:", np.sqrt(test_mse)),
]:
    print(label, value)

In [None]:
# Lasso regression: fit on the training split, then report
# test-set metrics and the training R2 (the latter scaled by 100).
lasso = Lasso().fit(x_train, y_train)
predlasso = lasso.predict(x_test)
predtrain = lasso.predict(x_train)

test_mse = mean_squared_error(y_test, predlasso)
print(lasso)
for label, value in [
    ("R2_score:", r2_score(y_test, predlasso)),
    ("R2 score on Training data:", r2_score(y_train, predtrain) * 100),
    ("Mean Absolute Error:", mean_absolute_error(y_test, predlasso)),
    ("Mean Squared Error:", test_mse),
    ("Root Mean Squared Error:", np.sqrt(test_mse)),
]:
    print(label, value)

In [None]:
# Ridge regression: fit on the training split, then report
# test-set metrics and the training R2 (the latter scaled by 100).
rd = Ridge().fit(x_train, y_train)
predrd = rd.predict(x_test)
predtrain = rd.predict(x_train)

test_mse = mean_squared_error(y_test, predrd)
print(rd)
for label, value in [
    ("R2_score:", r2_score(y_test, predrd)),
    ("R2 score on Training data:", r2_score(y_train, predtrain) * 100),
    ("Mean Absolute Error:", mean_absolute_error(y_test, predrd)),
    ("Mean Squared Error:", test_mse),
    ("Root Mean Squared Error:", np.sqrt(test_mse)),
]:
    print(label, value)

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Decision tree regressor: fit on the training split, then report
# test-set metrics and the training R2 (the latter scaled by 100).
dtr = DecisionTreeRegressor().fit(x_train, y_train)
preddtr = dtr.predict(x_test)
predtrain = dtr.predict(x_train)

test_mse = mean_squared_error(y_test, preddtr)
print(dtr)
for label, value in [
    ("R2_score:", r2_score(y_test, preddtr)),
    ("R2 score on Training data:", r2_score(y_train, predtrain) * 100),
    ("Mean Absolute Error:", mean_absolute_error(y_test, preddtr)),
    ("Mean Squared Error:", test_mse),
    ("Root Mean Squared Error:", np.sqrt(test_mse)),
]:
    print(label, value)

In [None]:
from sklearn.svm import SVR

# Support vector regressor: fit on the training split, then report
# test-set metrics and the training R2 (the latter scaled by 100).
svr = SVR().fit(x_train, y_train)
predsvr = svr.predict(x_test)
predtrain = svr.predict(x_train)

test_mse = mean_squared_error(y_test, predsvr)
print(svr)
for label, value in [
    ("R2_score:", r2_score(y_test, predsvr)),
    ("R2 score on Training data:", r2_score(y_train, predtrain) * 100),
    ("Mean Absolute Error:", mean_absolute_error(y_test, predsvr)),
    ("Mean Squared Error:", test_mse),
    ("Root Mean Squared Error:", np.sqrt(test_mse)),
]:
    print(label, value)

In [None]:
# Extra-trees regressor: fit on the training split, then report
# test-set metrics and the training R2 (the latter scaled by 100).
from sklearn.ensemble import ExtraTreesRegressor

ET = ExtraTreesRegressor().fit(x_train, y_train)
predET = ET.predict(x_test)
predtrain = ET.predict(x_train)

test_mse = mean_squared_error(y_test, predET)
print(ET)
for label, value in [
    ("R2_score:", r2_score(y_test, predET)),
    ("R2 score on Training data:", r2_score(y_train, predtrain) * 100),
    ("Mean Absolute Error:", mean_absolute_error(y_test, predET)),
    ("Mean Squared Error:", test_mse),
    ("Root Mean Squared Error:", np.sqrt(test_mse)),
]:
    print(label, value)

In [None]:
# Bagging regressor: fit on the training split, then report
# test-set metrics and the training R2 (the latter scaled by 100).
from sklearn.ensemble import BaggingRegressor

BR = BaggingRegressor().fit(x_train, y_train)
predBR = BR.predict(x_test)
predtrain = BR.predict(x_train)

test_mse = mean_squared_error(y_test, predBR)
print(BR)
for label, value in [
    ("R2_score:", r2_score(y_test, predBR)),
    ("R2 score on Training data:", r2_score(y_train, predtrain) * 100),
    ("Mean Absolute Error:", mean_absolute_error(y_test, predBR)),
    ("Mean Squared Error:", test_mse),
    ("Root Mean Squared Error:", np.sqrt(test_mse)),
]:
    print(label, value)


In [None]:
# AdaBoost regressor: fit on the training split, then report
# test-set metrics and the training R2 (the latter scaled by 100).
from sklearn.ensemble import AdaBoostRegressor

ABR = AdaBoostRegressor().fit(x_train, y_train)
predABR = ABR.predict(x_test)
predtrain = ABR.predict(x_train)

test_mse = mean_squared_error(y_test, predABR)
print(ABR)
for label, value in [
    ("R2_score:", r2_score(y_test, predABR)),
    ("R2 score on Training data:", r2_score(y_train, predtrain) * 100),
    ("Mean Absolute Error:", mean_absolute_error(y_test, predABR)),
    ("Mean Squared Error:", test_mse),
    ("Root Mean Squared Error:", np.sqrt(test_mse)),
]:
    print(label, value)


All the models show a high coefficient of determination except KNN, Lasso and SVR. The best performance, with the least error, is shown by the Random Forest, Gradient Boosting and Extra Trees regressors. The Linear Regression model also shows a great R² result, but its Mean Squared Error is on the higher side, so it is not considered for prediction on unseen data.


# Spliting train test data using Cross validation score

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# Cross-validation scores of the random forest on the full data, and how far
# the single-split test R2 is from their mean.
score = cross_val_score(RFR, x, y)
cv_mean = score.mean()
print(score)
print(cv_mean)
print("The difference between Accuracy score and Cross validation score: ", r2_score(y_test, predRFR) - cv_mean)

In [None]:
# Cross-validation scores of the extra-trees regressor on the full data, and
# how far the single-split test R2 is from their mean.
score1 = cross_val_score(ET, x, y)
cv_mean = score1.mean()
print(score1)
print(cv_mean)
print("The difference between Accuracy score and Cross validation score: ", r2_score(y_test, predET) - cv_mean)

In [None]:
# Cross-validation scores of the gradient boosting regressor on the full data,
# and how far the single-split test R2 is from their mean.
score2 = cross_val_score(GBR, x, y)
cv_mean = score2.mean()
print(score2)
print(cv_mean)
print("Difference between Accuracy score and Cross validation score is :", r2_score(y_test, predGBR) - cv_mean)

In [None]:
# Cross-validation scores of the decision tree regressor on the full data,
# and how far the single-split test R2 is from their mean.
score3 = cross_val_score(dtr, x, y)
cv_mean = score3.mean()
print(score3)
print(cv_mean)
print("Difference between Accuracy score and Cross validation score is :", r2_score(y_test, preddtr) - cv_mean)

In [None]:
# Cross-validation scores of the linear regression on the full data,
# and how far the single-split test R2 is from their mean.
score4 = cross_val_score(LR, x, y)
cv_mean = score4.mean()
print(score4)
print(cv_mean)
print("Difference between Accuracy score and Cross validation score is :", r2_score(y_test, predLR) - cv_mean)

In [None]:
# Cross-validation scores of the ridge regression on the full data,
# and how far the single-split test R2 is from their mean.
score5 = cross_val_score(rd, x, y)
cv_mean = score5.mean()
print(score5)
print(cv_mean)
print("Difference between Accuracy score and Cross validation score is :", r2_score(y_test, predrd) - cv_mean)

In [None]:
# Cross-validation scores of the AdaBoost regressor on the full data,
# and how far the single-split test R2 is from their mean.
score6 = cross_val_score(ABR, x, y)
cv_mean = score6.mean()
print(score6)
print(cv_mean)
print("Difference between Accuracy score and Cross validation score is :", r2_score(y_test, predABR) - cv_mean)

In [None]:
# Cross-validation scores of the bagging regressor on the full data,
# and how far the single-split test R2 is from their mean.
score7 = cross_val_score(BR, x, y)
cv_mean = score7.mean()
print(score7)
print(cv_mean)
print("Difference between Accuracy score and Cross validation score is :", r2_score(y_test, predBR) - cv_mean)

The best cross-validation result is shown by linear regression. The other regressors also show low bias, and given the high-variance data sample provided for training, the models perform well on unseen data.

# Hyper Parameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyper-parameter grid for LinearRegression.
# Only the options that change the fitted model are searched. copy_X and
# n_jobs were removed from the grid: they control input copying and
# parallelism, not the fitted coefficients, so they only multiplied the
# number of fits without changing any score.
parameters = {"fit_intercept": [True, False],
              "positive": [True, False]}

In [None]:
# Grid search over `parameters` for LinearRegression, using 5-fold cross-validation
GCV= GridSearchCV(LinearRegression(),parameters, cv = 5)

In [None]:
# Run the search on the training split
GCV.fit(x_train,y_train)

In [None]:
# Parameter combination selected by the search
GCV.best_params_

In [None]:
# Refit LinearRegression with the parameters selected by the grid search and
# report test-set and training metrics.
# The parameters are read from GCV.best_params_ rather than typed in by hand,
# so this model always matches the search result, even after the data,
# the split or the grid change.
model_lr = LinearRegression(**GCV.best_params_)
model_lr.fit(x_train, y_train)
prediLR = model_lr.predict(x_test)
preditrain = model_lr.predict(x_train)
print(model_lr)
print("R2 score: ", r2_score(y_test, prediLR))
print("R2 score on training dataset: ", r2_score(y_train, preditrain))
print("Mean Squared Error: ", mean_squared_error(y_test, prediLR))
print("Mean Absolute Error:", mean_absolute_error(y_test, prediLR))
print("Root Mean Squared Error:", np.sqrt(mean_squared_error(y_test, prediLR)))

# Saving the model

In [None]:
import pickle

# Serialize the tuned linear regression model to disk.
# The file is opened in a `with` block so the handle is flushed and closed
# as soon as the dump finishes (it was previously left open).
filename1 = "student_grade1.pkl"
with open(filename1, "wb") as model_file:
    pickle.dump(model_lr, model_file)

In [None]:
# Load the saved linear regression model back and print its score on the test
# split, scaled by 100. The file is opened in a `with` block so the handle is
# closed after loading (it was previously left open).
# NOTE: only unpickle files you created yourself - pickle.load can execute
# arbitrary code from an untrusted file.
with open("student_grade1.pkl", "rb") as model_file:
    load_model1 = pickle.load(model_file)
result1 = load_model1.score(x_test, y_test)
print(result1*100)

In [None]:
# Two-row table comparing the loaded model's predictions on x_test ("Predicted")
# with the true y_test values ("Original")
conclusion1 =pd.DataFrame([load_model1.predict(x_test)[:],y_test[:]],index=["Predicted","Original"])
conclusion1

In [None]:
# Hyper-parameter grid for ExtraTreesRegressor.
# random_state and n_jobs were removed from the grid: they are not
# hyper-parameters. Searching over the seed picks a model by luck, and n_jobs
# only sets the degree of parallelism, so together they multiplied the number
# of fits by nine. The seed is fixed on the estimator instead so the search
# is reproducible, and the search itself is parallelised across all cores.

parameters2 = {"criterion": ["squared_error", "absolute_error", "friedman_mse", "poisson"],
               "max_depth": [None, 10, 20],
               "n_estimators": [50, 100, 200, 300]}

GCV2 = GridSearchCV(ExtraTreesRegressor(random_state=1000), parameters2, cv=5, n_jobs=-1)

In [None]:
# Run the ExtraTrees grid search on the training split
GCV2.fit(x_train,y_train)

In [None]:
# Parameter combination selected by the ExtraTrees search
GCV2.best_params_

In [None]:
# Refit ExtraTreesRegressor with the parameters selected by the grid search
# and report test-set and training metrics.
# The parameters are read from GCV2.best_params_ rather than typed in by
# hand, so this model always matches the search result. random_state and
# n_jobs act as fallbacks, used only when the search did not select them.

final_params = {"random_state": 1000, "n_jobs": -2, **GCV2.best_params_}
final_model = ExtraTreesRegressor(**final_params)
final_model.fit(x_train, y_train)
prediET = final_model.predict(x_test)
preditrain = final_model.predict(x_train)
print(final_model)
print("R2_score:", r2_score(y_test, prediET))
print("R2 score on Training data:", r2_score(y_train, preditrain)*100)
print("Mean Absolute Error:", mean_absolute_error(y_test, prediET))
print("Mean Squared Error:", mean_squared_error(y_test, prediET))
print("Root Mean Squared Error:", np.sqrt(mean_squared_error(y_test, prediET)))

# Saving the model

In [None]:
import pickle

# Serialize the final ExtraTrees model to disk.
# The file is opened in a `with` block so the handle is flushed and closed
# as soon as the dump finishes (it was previously left open).
filename = "student_grade.pkl"
with open(filename, "wb") as model_file:
    pickle.dump(final_model, model_file)

In [None]:
import pickle

# Load the saved ExtraTrees model back and print its score on the test split,
# scaled by 100. The file is opened in a `with` block so the handle is closed
# after loading (it was previously left open).
# NOTE: only unpickle files you created yourself - pickle.load can execute
# arbitrary code from an untrusted file.
with open("student_grade.pkl", "rb") as model_file:
    load_model = pickle.load(model_file)
result = load_model.score(x_test, y_test)
print(result*100)

In [None]:
# Two-row table comparing the loaded model's predictions on x_test ("Predicted")
# with the true y_test values ("Original")
conclusion =pd.DataFrame([load_model.predict(x_test)[:],y_test[:]],index=["Predicted","Original"])
conclusion