In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import PolynomialFeatures

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedShuffleSplit

from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC 
from sklearn.ensemble import VotingClassifier

In [2]:
train_data = pd.read_csv("Final Data\\Warning_Train_Data.csv")
test_data = pd.read_csv("Final Data\\Warning_Test_Data.csv")
train_data.head()

Unnamed: 0,STUDENT_ID,SEM_1_CGPA,SEM_2_CGPA,SEM_3_CGPA,SEM_4_CGPA,SEM_5_CGPA,SEM_6_CGPA,SEM_7_CGPA,SEM_8_CGPA,SEM_9_CGPA,...,FIRST_SEM,LAST_SEM,CITY,SECONDARY,SCHOOL,SEC_GRADE,HIGHER_SECONDARY,COLLEGE,HIG_SEC_GRADE,CGPA
0,2B2F31476D36586A566333416F762B387A78423374513D3D,1.84,1.65,1.74,2.13,2.07,2.1,2.14,2.01,1.99,...,Fall 2004,Spring 2009,Others,SSC,other,602.0,HSSC,others,293.0,2.09
1,2B3369304332746C35385A32494E4679677348446B773D3D,1.33,3.0,2.52,2.77,2.92,2.87,2.82,2.82,2.83,...,Fall 2010,Spring 2014,Hafizabad,SSC,usman science school,726.0,HSSC,punjab group of collges,839.0,2.78
2,2B34512B695A45635635486154306A315A69452B54773D3D,2.92,2.45,2.47,2.26,2.21,2.18,1.95,2.0,2.06,...,Fall 2007,Fall 2012,Islamabad,SSC,other,627.0,HSSC,others,758.0,2.18
3,2B357A6373485163476B5A75556F656F536666534E413D3D,0.77,2.37,3.0,2.29,1.91,2.13,2.23,2.06,2.35,...,Fall 2005,Fall 2009,Islamabad,SSC,other,552.0,HSSC,feder board of intermedi and secondari educ,635.0,2.35
4,2B37674B6676503945536C413973305A575A4C5435513D3D,1.87,2.12,2.15,2.12,2.04,1.98,2.1,2.18,2.25,...,Fall 2003,Fall 2007,Rawalpindi,SSC,federal board of intermediate and secondary ed...,655.0,HSSC,feder board of intermedi and secondari educ,805.0,2.25


In [3]:
def data_preprocessing(data):

    ## Encoding data
    labelencoder = LabelEncoder()

    labelencoder.fit(data['GENDER'])
    data['GENDER'] = labelencoder.transform(data['GENDER'])
    gender_class = list(labelencoder.classes_)

    labelencoder.fit(data['BATCH'])
    data['BATCH'] = labelencoder.transform(data['BATCH'])
    batch_class = list(labelencoder.classes_)

    labelencoder.fit(data['CAMPUS'])
    data['CAMPUS'] = labelencoder.transform(data['CAMPUS'])
    campus_class = list(labelencoder.classes_)

    labelencoder.fit(data['PROG_CODE'])
    data['PROG_CODE'] = labelencoder.transform(data['PROG_CODE'])
    prog_code_class = list(labelencoder.classes_)

    labelencoder.fit(data['FIRST_SEM'])
    data['FIRST_SEM'] = labelencoder.transform(data['FIRST_SEM'])
    first_sem_class = list(labelencoder.classes_)

    labelencoder.fit(data['LAST_SEM'])
    data['LAST_SEM'] = labelencoder.transform(data['LAST_SEM'])
    last_sem_class = list(labelencoder.classes_)

    labelencoder.fit(data['CITY'])
    data['CITY'] = labelencoder.transform(data['CITY'])
    city_class = list(labelencoder.classes_)

    labelencoder.fit(data['SECONDARY'])
    data['SECONDARY'] = labelencoder.transform(data['SECONDARY'])
    sec_class = list(labelencoder.classes_)

    labelencoder.fit(data['HIGHER_SECONDARY'])
    data['HIGHER_SECONDARY'] = labelencoder.transform(data['HIGHER_SECONDARY'])
    hig_sec_class = list(labelencoder.classes_)

    labelencoder.fit(data['SCHOOL'])
    data['SCHOOL'] = labelencoder.transform(data['SCHOOL'])
    school_class = list(labelencoder.classes_)

    labelencoder.fit(data['COLLEGE'])
    data['COLLEGE'] = labelencoder.transform(data['COLLEGE'])
    college_class = list(labelencoder.classes_)
    
    ## Rearranging columns
    data = data[['SEM_1_CGPA', 'SEM_2_CGPA', 'SEM_3_CGPA', 'SEM_4_CGPA', 'SEM_5_CGPA', 'SEM_6_CGPA', 'SEM_7_CGPA', 'TOTAL_SEM', 
                 'GENDER', 'BATCH', 'CAMPUS', 'PROG_CODE', 'FIRST_SEM', 'LAST_SEM', 'CITY', 'SECONDARY', 'SCHOOL', 'SEC_GRADE', 
                 'HIGHER_SECONDARY', 'COLLEGE', 'HIG_SEC_GRADE', 'CGPA']]
    
    return data

In [4]:
train = pd.DataFrame()
train = data_preprocessing(train_data)

test = pd.DataFrame()
test = data_preprocessing(test_data)
test = test.fillna(0)

# Regression 

In [5]:
X = train.iloc[:, :-1]
y = train.iloc[:, -1]

X = (X - X.min()) / (X.max() - X.min())

X_train, X_validate, y_train, y_validate = train_test_split(X, y, test_size=0.2, random_state=1)

X_test = test.iloc[:, :-1]
y_test = test.iloc[:, -1:]

X_test = (X_test - X_test.min()) / (X_test.max() - X_test.min())

print('Training Data : ', len(X_train))
print("Validation Data : ", len(X_validate))
print('Testing Data : ', len(X_test))

Training Data :  1424
Validation Data :  356
Testing Data :  1283


##  Linear Regression

In [6]:
"""
Linear regression applied on all features i.e: 'SEM_1_CGPA', 'SEM_2_CGPA', 'SEM_3_CGPA', 'SEM_4_CGPA', 'SEM_5_CGPA',
'SEM_6_CGPA', 'SEM_7_CGPA', 'TOTAL_SEM', 'GENDER', 'BATCH', 'CAMPUS', 'PROG_CODE', 'FIRST_SEM', 'LAST_SEM', 'CITY', 
'SECONDARY', 'SCHOOL', 'SEC_GRADE', 'HIGHER_SECONDARY', 'COLLEGE', 'HIG_SEC_GRADE', 'CGPA'
"""

model1 = LinearRegression()
model1.fit(X_train, y_train)
y_pred = model1.predict(X_validate)
model1_acc = r2_score(y_pred, y_validate)
rmse_1 = np.sqrt(mean_squared_error(y_validate, y_pred))
print("R^2:", model1_acc)
print("RMSE:", rmse_1)

R^2: 0.7500072718401063
RMSE: 0.12527195201622007


## Polynomial Regression 

In [7]:
degree = 2
polynomial_features = PolynomialFeatures(degree = degree)

model2 = LinearRegression()

X_train_ = polynomial_features.fit_transform(X_train)
model2.fit(X_train_, y_train)

X_validate_ = polynomial_features.fit_transform(X_validate)
y_pred = model2.predict(X_validate_)

model2_acc = r2_score(y_validate, y_pred)
rmse_2 = np.sqrt(mean_squared_error(y_validate, y_pred))
print("R^2:", model2_acc)
print("RMSE:", rmse_2)

R^2: 0.7902649452403376
RMSE: 0.12559180731955086


# Classification 

In [8]:
labelencoder = LabelEncoder()
y = labelencoder.fit_transform(pd.cut(train['CGPA'], 5, retbins=True)[0])
print(list(labelencoder.classes_))

[Interval(1.958, 2.328, closed='right'), Interval(2.328, 2.696, closed='right'), Interval(2.696, 3.064, closed='right'), Interval(3.064, 3.432, closed='right'), Interval(3.432, 3.8, closed='right')]


In [9]:
X_train, X_validate, y_train, y_validate = train_test_split(X, y, test_size=0.3, random_state=1)

## Gaussian Naive Bayes

In [10]:
model3 = GaussianNB()
model3.fit(X_train, y_train)
y_pred = model3.predict(X_validate)
model3_acc = accuracy_score(y_validate, y_pred)
print("Accuracy:", model3_acc)

Accuracy: 0.7116104868913857


## Logistic Regression

In [11]:
model4 = LogisticRegression(solver='sag', multi_class='multinomial')
model4.fit(X_train, y_train)
model4_acc = model4.score(X_validate, y_validate)
print("Accuracy:", model4_acc)

Accuracy: 0.7247191011235955


## K-Nearest Neighbour

In [12]:
for i in range(1, 10):
    model5 = KNeighborsClassifier(n_neighbors=i) 
    model5.fit(X_train, y_train) 
    y_pred = model5.predict(X_validate) 
    model5_acc = accuracy_score(y_validate, y_pred)
    print(f"{i}-NN - Accuracy Score: {model5_acc}") 

1-NN - Accuracy Score: 0.550561797752809
2-NN - Accuracy Score: 0.5861423220973783
3-NN - Accuracy Score: 0.5636704119850188
4-NN - Accuracy Score: 0.5917602996254682
5-NN - Accuracy Score: 0.5973782771535581
6-NN - Accuracy Score: 0.6086142322097379
7-NN - Accuracy Score: 0.5898876404494382
8-NN - Accuracy Score: 0.6254681647940075
9-NN - Accuracy Score: 0.5936329588014981


## Decision Tree Classifier

In [13]:
model6 = DecisionTreeClassifier(criterion = 'entropy', ccp_alpha = 0.015)
model6.fit(X_train, y_train)
y_pred = model6.predict(X_validate)
model6_acc = accuracy_score(y_validate, y_pred)
print("Accuracy: ", model6_acc)

Accuracy:  0.7247191011235955


## Random Forest Classifier

In [14]:
model7 = RandomForestClassifier(n_estimators=100, bootstrap = True, max_features = 'sqrt', criterion = 'entropy')
model7.fit(X_train, y_train)
y_pred = model7.predict(X_validate)
model7_acc = accuracy_score(y_validate, y_pred)
print("Accuracy: ", model7_acc)

Accuracy:  0.7640449438202247


## Support Vector Machine

In [15]:
model8 = SVC(kernel="linear", C=1)
model8.fit(X_train, y_train)
y_pred = model8.predict(X_validate)
model8_acc = accuracy_score(y_validate, y_pred)
print("Accuracy: ", model8_acc)

Accuracy:  0.7378277153558053


## Ensemble Classifier

In [16]:
final = VotingClassifier(estimators=[('gnb', model3), 
                                     ('lgr', model4), 
                                     ('knn', model5), 
                                     ('dt', model6), 
                                     ('rf', model7),
                                     ('svm', model8)], 
                         voting='hard')

final.fit(X_train, y_train)
y_pred = final.predict(X_validate)
final_acc = accuracy_score(y_validate, y_pred)
print("Accuracy: ", final_acc)

Accuracy:  0.7434456928838952


# Results

In [17]:
print("=========Accuracy==========\n")

print("Linear Regression")
print("R-Squared:", round(model1_acc, 3))
print("RMSE: ", round(rmse_1, 3))

print("\nPolynmial Regression with degree 2")
print("R-Squared:", round(model2_acc, 3))
print("RMSE: ", round(rmse_2, 3))

print("\nGaussian Naive Bayes:", round(model3_acc, 3))
print("Logistic Regression:", round(model4_acc, 3))
print("K - Nearest Neighbor:", round(model5_acc, 3))
print("Decision Tree Classifier:", round(model6_acc, 3))
print("Random Forest Classifier:", round(model7_acc, 3))
print("Support Vector Machine:", round(model8_acc, 3))
print("Ensemble Classifier:", round(final_acc, 3))


Linear Regression
R-Squared: 0.75
RMSE:  0.125

Polynmial Regression with degree 2
R-Squared: 0.79
RMSE:  0.126

Gaussian Naive Bayes: 0.712
Logistic Regression: 0.725
K - Nearest Neighbor: 0.594
Decision Tree Classifier: 0.725
Random Forest Classifier: 0.764
Support Vector Machine: 0.738
Ensemble Classifier: 0.743


# Classification using Hold-Out Approach Repeated 100 times

In [18]:
CV = StratifiedShuffleSplit(n_splits = 100, test_size = 0.3, random_state = 0)

## Gaussian Naive Bayes

In [19]:
model3 = GaussianNB()
model3_acc = cross_val_score(model3, X_train, y_train, cv=CV)

## Logistic Regression

In [20]:
model4 = LogisticRegression(solver='sag', multi_class='multinomial')
model4_acc = cross_val_score(model4, X_train, y_train, cv=CV)

## K-Nearest Neighbour

In [21]:
model5 = KNeighborsClassifier(n_neighbors=3)
model5_acc = cross_val_score(model5, X_train, y_train, cv=CV)

## Decision Tree Classifier

In [22]:
model6 = DecisionTreeClassifier(criterion = 'entropy', ccp_alpha = 0.015)
model6_acc = cross_val_score(model6, X_train, y_train, cv=CV, n_jobs = -1)

## Random Forest Classifier

In [23]:
model7 = RandomForestClassifier(n_estimators=100, bootstrap = True, max_features = 'sqrt', criterion = 'entropy')
model7_acc = cross_val_score(model7, X_train, y_train, cv=CV, n_jobs = -1)

## Support Vector Machine

In [24]:
model8 = SVC(kernel="linear", C=1)
model8_acc = cross_val_score(model6, X_train, y_train, cv=CV, n_jobs = -1)

## Ensemble Classifier

In [25]:
final = VotingClassifier(estimators=[('gnb', model3), 
                                     ('lgr', model4), 
                                     ('knn', model5), 
                                     ('dt', model6), 
                                     ('rf', model7),
                                     ('svm', model8)], 
                         voting='hard')

final_acc = cross_val_score(final, X_train, y_train, cv=CV, n_jobs = -1)

# Results of Classification using Hold-out Repeated 10 times

In [26]:
print("==========================Accuracy=========================\n")

print("Gaussian Naive Bayes:")
print(f"Mean Score: {np.round(np.mean(model3_acc), 3)}    Standard Deviation: {np.round(np.std(model3_acc), 3)}")

print("\nLogistic Regression:")
print(f"Mean Score: {np.round(np.mean(model4_acc), 3)}    Standard Deviation: {np.round(np.std(model4_acc), 3)}")

print("\nK - Nearest Neighbor:")
print(f"Mean Score: {np.round(np.mean(model5_acc), 3)}    Standard Deviation: {np.round(np.std(model5_acc), 3)}")

print("\nDecision Tree Classifier:")
print(f"Mean Score: {np.round(np.mean(model6_acc), 3)}    Standard Deviation: {np.round(np.std(model6_acc), 3)}")

print("\nRandom Forest Classifier:")
print(f"Mean Score: {np.round(np.mean(model7_acc), 3)}    Standard Deviation: {np.round(np.std(model7_acc), 3)}")

print("\nSupport Vector Machine:")
print(f"Mean Score: {np.round(np.mean(model8_acc), 3)}    Standard Deviation: {np.round(np.std(model8_acc), 3)}")

print("\nEnsemble Classifier:")
print(f"Mean Score: {np.round(np.mean(final_acc), 3)}     Standard Deviation: {np.round(np.std(final_acc), 3)}")


Gaussian Naive Bayes:
Mean Score: 0.685    Standard Deviation: 0.022

Logistic Regression:
Mean Score: 0.692    Standard Deviation: 0.02

K - Nearest Neighbor:
Mean Score: 0.54    Standard Deviation: 0.022

Decision Tree Classifier:
Mean Score: 0.732    Standard Deviation: 0.019

Random Forest Classifier:
Mean Score: 0.739    Standard Deviation: 0.019

Support Vector Machine:
Mean Score: 0.732    Standard Deviation: 0.019

Ensemble Classifier:
Mean Score: 0.723     Standard Deviation: 0.021


# Predictive Modeling

In [27]:
X = train.iloc[:, :-1]
y = train.iloc[:, -1]

X_test = test.iloc[:, :-1]

In [28]:
X_ = X[['SEM_1_CGPA', 'TOTAL_SEM', 'GENDER', 'BATCH', 'CAMPUS','PROG_CODE', 'FIRST_SEM', 'LAST_SEM', 'CITY', 'SECONDARY', 'SCHOOL',
        'SEC_GRADE', 'HIGHER_SECONDARY', 'COLLEGE', 'HIG_SEC_GRADE']]

test_sem_2 = X_test[X_test["TOTAL_SEM"] == 1]
test_sem_2 = test_sem_2[['SEM_1_CGPA', 'TOTAL_SEM', 'GENDER', 'BATCH', 'CAMPUS','PROG_CODE', 'FIRST_SEM', 'LAST_SEM', 'CITY', 
                         'SECONDARY', 'SCHOOL','SEC_GRADE', 'HIGHER_SECONDARY', 'COLLEGE', 'HIG_SEC_GRADE']]

print("Predict CGPA of 2nd Semester\nTotal Records:", len(test_sem_2))

model = LinearRegression()
model.fit(X_, y)
y_pred = np.round(model.predict(test_sem_2), 2)
test_sem_2['SEM_2_CGPA'] = y_pred
test_sem_2.head()

Predict CGPA of 2nd Semester
Total Records: 827


Unnamed: 0,SEM_1_CGPA,TOTAL_SEM,GENDER,BATCH,CAMPUS,PROG_CODE,FIRST_SEM,LAST_SEM,CITY,SECONDARY,SCHOOL,SEC_GRADE,HIGHER_SECONDARY,COLLEGE,HIG_SEC_GRADE,SEM_2_CGPA
0,1.26,1,1,7,3,4,7,1,31,1,10,982,1,15,866.0,3.25
1,1.94,1,1,7,2,2,7,3,25,1,20,755,2,13,756.84,3.09
2,0.63,1,1,7,4,2,7,3,25,1,16,635,1,15,648.0,3.23
3,0.0,1,1,7,1,2,7,1,17,1,13,979,1,15,920.0,3.27
4,1.41,1,1,7,3,2,7,3,26,1,18,1011,1,15,840.0,3.25


In [29]:
X_ = X[['SEM_1_CGPA', 'SEM_2_CGPA', 'TOTAL_SEM', 'GENDER', 'BATCH', 'CAMPUS','PROG_CODE', 'FIRST_SEM', 'LAST_SEM', 'CITY', 
        'SECONDARY', 'SCHOOL', 'SEC_GRADE', 'HIGHER_SECONDARY', 'COLLEGE', 'HIG_SEC_GRADE']]

test_sem_3 = X_test[X_test["TOTAL_SEM"] == 2]
test_sem_3 = test_sem_3[['SEM_1_CGPA', 'SEM_2_CGPA', 'TOTAL_SEM', 'GENDER', 'BATCH', 'CAMPUS','PROG_CODE', 'FIRST_SEM', 
                         'LAST_SEM', 'CITY', 'SECONDARY', 'SCHOOL','SEC_GRADE', 'HIGHER_SECONDARY', 'COLLEGE', 'HIG_SEC_GRADE']]

print("\nPredict CGPA of 3rd Semester\nTotal Records:", len(test_sem_3))

model = LinearRegression()
model.fit(X_, y)
y_pred = np.round(model.predict(test_sem_3), 2)
test_sem_3['SEM_3_CGPA'] = y_pred
test_sem_3.head()


Predict CGPA of 3rd Semester
Total Records: 58


Unnamed: 0,SEM_1_CGPA,SEM_2_CGPA,TOTAL_SEM,GENDER,BATCH,CAMPUS,PROG_CODE,FIRST_SEM,LAST_SEM,CITY,SECONDARY,SCHOOL,SEC_GRADE,HIGHER_SECONDARY,COLLEGE,HIG_SEC_GRADE,SEM_3_CGPA
20,1.2,1.27,2,1,5,2,2,5,0,66,1,24,733,2,13,756.84,2.86
52,1.31,1.68,2,1,6,3,2,6,2,31,1,18,1067,1,15,1008.0,3.04
113,0.58,1.33,2,1,6,2,4,10,3,33,1,20,628,1,9,958.0,2.85
156,1.22,0.92,2,1,6,1,2,6,2,6,1,26,949,1,15,887.0,2.79
172,0.29,0.29,2,1,6,2,4,6,2,25,0,32,585,0,18,685.0,2.68


In [30]:
X_ = X[['SEM_1_CGPA', 'SEM_2_CGPA', 'SEM_3_CGPA', 'TOTAL_SEM', 'GENDER', 'BATCH', 'CAMPUS','PROG_CODE', 'FIRST_SEM', 
        'LAST_SEM', 'CITY', 'SECONDARY', 'SCHOOL', 'SEC_GRADE', 'HIGHER_SECONDARY', 'COLLEGE', 'HIG_SEC_GRADE']]

test_sem_4 = X_test[X_test["TOTAL_SEM"] == 3]
test_sem_4 = test_sem_4[['SEM_1_CGPA', 'SEM_2_CGPA', 'SEM_3_CGPA', 'TOTAL_SEM', 'GENDER', 'BATCH', 'CAMPUS','PROG_CODE', 
                         'FIRST_SEM', 'LAST_SEM', 'CITY', 'SECONDARY', 'SCHOOL','SEC_GRADE', 'HIGHER_SECONDARY', 'COLLEGE', 
                         'HIG_SEC_GRADE']]

print("\nPredict CGPA of 4th Semester\nTotal Records:", len(test_sem_4))

model = LinearRegression()
model.fit(X_, y)
y_pred = np.round(model.predict(test_sem_4), 2)
test_sem_4['SEM_4_CGPA'] = y_pred
test_sem_4.head()


Predict CGPA of 4th Semester
Total Records: 111


Unnamed: 0,SEM_1_CGPA,SEM_2_CGPA,SEM_3_CGPA,TOTAL_SEM,GENDER,BATCH,CAMPUS,PROG_CODE,FIRST_SEM,LAST_SEM,CITY,SECONDARY,SCHOOL,SEC_GRADE,HIGHER_SECONDARY,COLLEGE,HIG_SEC_GRADE,SEM_4_CGPA
12,1.39,1.97,1.72,3,1,6,1,2,6,1,13,0,32,675,0,18,880.0,2.73
18,2.58,2.14,1.84,3,1,6,3,0,6,1,21,0,32,737,0,18,890.0,2.84
28,0.0,1.4,2.0,3,1,4,2,4,9,1,25,0,32,641,2,13,756.84,2.81
32,0.87,1.39,1.7,3,1,6,2,4,6,1,31,1,1,928,1,7,753.0,2.7
34,1.78,2.4,1.73,3,1,6,4,4,6,3,40,1,10,861,1,9,806.0,2.8


In [31]:
X_ = X[['SEM_1_CGPA', 'SEM_2_CGPA', 'SEM_3_CGPA', 'SEM_4_CGPA', 'TOTAL_SEM', 'GENDER', 'BATCH', 'CAMPUS','PROG_CODE', 
        'FIRST_SEM', 'LAST_SEM', 'CITY', 'SECONDARY', 'SCHOOL', 'SEC_GRADE', 'HIGHER_SECONDARY', 'COLLEGE', 'HIG_SEC_GRADE']]

test_sem_5 = X_test[X_test["TOTAL_SEM"] == 4]
test_sem_5 = test_sem_5[['SEM_1_CGPA', 'SEM_2_CGPA', 'SEM_3_CGPA', 'SEM_4_CGPA', 'TOTAL_SEM', 'GENDER', 'BATCH', 'CAMPUS',
                         'PROG_CODE', 'FIRST_SEM', 'LAST_SEM', 'CITY', 'SECONDARY', 'SCHOOL','SEC_GRADE', 'HIGHER_SECONDARY', 
                         'COLLEGE', 'HIG_SEC_GRADE']]

print("\nPredict CGPA of 5th Semester\nTotal Records:", len(test_sem_5))

model = LinearRegression()
model.fit(X_, y)
y_pred = np.round(model.predict(test_sem_5), 2)
test_sem_5['SEM_5_CGPA'] = y_pred
test_sem_5.head()


Predict CGPA of 5th Semester
Total Records: 61


Unnamed: 0,SEM_1_CGPA,SEM_2_CGPA,SEM_3_CGPA,SEM_4_CGPA,TOTAL_SEM,GENDER,BATCH,CAMPUS,PROG_CODE,FIRST_SEM,LAST_SEM,CITY,SECONDARY,SCHOOL,SEC_GRADE,HIGHER_SECONDARY,COLLEGE,HIG_SEC_GRADE,SEM_5_CGPA
27,1.28,1.75,2.03,1.63,4,1,6,2,2,6,3,25,1,16,698,1,15,792.0,2.44
57,2.35,1.86,2.12,1.65,4,1,6,0,2,6,3,26,1,18,960,1,15,758.0,2.44
59,1.71,2.33,1.69,1.86,4,1,5,3,4,5,2,15,1,13,921,1,15,866.0,2.57
82,2.0,1.84,2.04,1.46,4,1,5,1,1,5,0,21,1,10,844,1,6,712.0,2.37
85,1.1,1.87,2.3,1.83,4,1,6,1,2,6,3,21,1,26,948,1,15,832.0,2.57


In [32]:
X_ = X[['SEM_1_CGPA', 'SEM_2_CGPA', 'SEM_3_CGPA', 'SEM_4_CGPA', 'SEM_5_CGPA', 'TOTAL_SEM', 'GENDER', 'BATCH', 'CAMPUS',
        'PROG_CODE', 'FIRST_SEM', 'LAST_SEM', 'CITY', 'SECONDARY', 'SCHOOL', 'SEC_GRADE', 'HIGHER_SECONDARY', 'COLLEGE', 
        'HIG_SEC_GRADE']]

test_sem_6 = X_test[X_test["TOTAL_SEM"] == 5]
test_sem_6 = test_sem_6[['SEM_1_CGPA', 'SEM_2_CGPA', 'SEM_3_CGPA', 'SEM_4_CGPA', 'SEM_5_CGPA', 'TOTAL_SEM', 'GENDER', 'BATCH', 
                         'CAMPUS','PROG_CODE', 'FIRST_SEM', 'LAST_SEM', 'CITY', 'SECONDARY', 'SCHOOL','SEC_GRADE', 
                         'HIGHER_SECONDARY', 'COLLEGE', 'HIG_SEC_GRADE']]

print("\nPredict CGPA of 6th Semester\nTotal Records:", len(test_sem_6))

model = LinearRegression() 
model.fit(X_, y)
y_pred = np.round(model.predict(test_sem_6), 2)
test_sem_6['SEM_6_CGPA'] = y_pred
test_sem_6.head()


Predict CGPA of 6th Semester
Total Records: 60


Unnamed: 0,SEM_1_CGPA,SEM_2_CGPA,SEM_3_CGPA,SEM_4_CGPA,SEM_5_CGPA,TOTAL_SEM,GENDER,BATCH,CAMPUS,PROG_CODE,FIRST_SEM,LAST_SEM,CITY,SECONDARY,SCHOOL,SEC_GRADE,HIGHER_SECONDARY,COLLEGE,HIG_SEC_GRADE,SEM_6_CGPA
13,1.71,2.28,2.02,1.72,1.61,5,1,5,3,4,5,3,57,1,29,971,1,17,869.0,2.32
23,1.74,2.02,1.75,1.69,1.74,5,1,5,2,2,5,2,25,0,32,698,2,13,756.84,2.31
71,0.25,2.06,1.71,1.55,1.75,5,1,5,1,0,5,1,21,1,14,680,1,15,794.0,2.31
84,2.02,2.03,2.14,2.07,1.71,5,1,5,4,2,5,1,65,1,9,933,1,15,750.0,2.43
96,1.65,2.23,2.32,2.06,1.97,5,1,5,1,2,5,3,21,0,32,731,0,18,845.0,2.49


In [33]:
X_ = X[['SEM_1_CGPA', 'SEM_2_CGPA', 'SEM_3_CGPA', 'SEM_4_CGPA', 'SEM_5_CGPA', 'SEM_6_CGPA', 'TOTAL_SEM', 'GENDER', 'BATCH', 
        'CAMPUS', 'PROG_CODE', 'FIRST_SEM', 'LAST_SEM', 'CITY', 'SECONDARY', 'SCHOOL', 'SEC_GRADE', 'HIGHER_SECONDARY', 
        'COLLEGE', 'HIG_SEC_GRADE']]

test_sem_7 = X_test[X_test["TOTAL_SEM"] == 6]
test_sem_7 = test_sem_7[['SEM_1_CGPA', 'SEM_2_CGPA', 'SEM_3_CGPA', 'SEM_4_CGPA', 'SEM_5_CGPA', 'SEM_6_CGPA', 'TOTAL_SEM', 
                         'GENDER', 'BATCH', 'CAMPUS','PROG_CODE', 'FIRST_SEM', 'LAST_SEM', 'CITY', 'SECONDARY', 'SCHOOL',
                         'SEC_GRADE', 'HIGHER_SECONDARY', 'COLLEGE', 'HIG_SEC_GRADE']]

print("\nPredict CGPA of 7th Semester\nTotal Records:", len(test_sem_7))

model = LinearRegression()
model.fit(X_, y)
y_pred = np.round(model.predict(test_sem_7), 2)
test_sem_7['SEM_7_CGPA'] = y_pred
test_sem_7.head()


Predict CGPA of 7th Semester
Total Records: 40


Unnamed: 0,SEM_1_CGPA,SEM_2_CGPA,SEM_3_CGPA,SEM_4_CGPA,SEM_5_CGPA,SEM_6_CGPA,TOTAL_SEM,GENDER,BATCH,CAMPUS,...,FIRST_SEM,LAST_SEM,CITY,SECONDARY,SCHOOL,SEC_GRADE,HIGHER_SECONDARY,COLLEGE,HIG_SEC_GRADE,SEM_7_CGPA
24,0.75,1.65,2.04,1.88,2.17,1.85,6,1,5,4,...,5,3,51,1,33,920,1,15,757.0,2.3
35,1.63,1.68,2.3,1.92,2.05,1.79,6,1,5,3,...,5,3,4,1,1,919,1,7,761.0,2.24
44,2.04,2.08,2.36,1.96,1.91,2.02,6,1,5,0,...,5,1,49,1,13,901,1,15,842.0,2.33
49,1.5,2.5,2.18,1.81,1.88,1.88,6,1,5,1,...,5,3,21,1,10,973,1,6,677.0,2.29
133,1.88,2.75,2.03,1.99,2.05,1.99,6,1,5,3,...,5,3,31,1,18,983,1,15,895.0,2.39


In [34]:
X_ = X[['SEM_1_CGPA', 'SEM_2_CGPA', 'SEM_3_CGPA', 'SEM_4_CGPA', 'SEM_5_CGPA', 'SEM_6_CGPA', 'SEM_7_CGPA', 'TOTAL_SEM', 'GENDER', 
        'BATCH', 'CAMPUS', 'PROG_CODE', 'FIRST_SEM', 'LAST_SEM', 'CITY', 'SECONDARY', 'SCHOOL', 'SEC_GRADE', 'HIGHER_SECONDARY', 
        'COLLEGE', 'HIG_SEC_GRADE']]

test_sem_8 = X_test[X_test["TOTAL_SEM"] == 7]
test_sem_8 = test_sem_8[['SEM_1_CGPA', 'SEM_2_CGPA', 'SEM_3_CGPA', 'SEM_4_CGPA', 'SEM_5_CGPA', 'SEM_6_CGPA', 'SEM_7_CGPA', 
                         'TOTAL_SEM', 'GENDER', 'BATCH', 'CAMPUS','PROG_CODE', 'FIRST_SEM', 'LAST_SEM', 'CITY', 'SECONDARY', 
                         'SCHOOL','SEC_GRADE', 'HIGHER_SECONDARY', 'COLLEGE', 'HIG_SEC_GRADE']]

print("\nPredict CGPA of 8th Semester\nTotal Records:", len(test_sem_8))

model = LinearRegression()
model.fit(X_, y)
y_pred = np.round(model.predict(test_sem_8), 2)
test_sem_8['SEM_8_CGPA'] = y_pred
test_sem_8.head()


Predict CGPA of 8th Semester
Total Records: 33


Unnamed: 0,SEM_1_CGPA,SEM_2_CGPA,SEM_3_CGPA,SEM_4_CGPA,SEM_5_CGPA,SEM_6_CGPA,SEM_7_CGPA,TOTAL_SEM,GENDER,BATCH,...,FIRST_SEM,LAST_SEM,CITY,SECONDARY,SCHOOL,SEC_GRADE,HIGHER_SECONDARY,COLLEGE,HIG_SEC_GRADE,SEM_8_CGPA
42,2.9,2.13,2.32,2.18,2.09,1.95,1.69,7,1,4,...,4,2,66,1,24,709,1,15,919.0,2.05
65,1.43,1.81,2.39,1.9,1.79,2.02,1.9,7,1,4,...,4,3,13,1,1,900,1,13,810.0,2.22
125,2.58,2.23,2.15,2.21,2.19,2.16,1.99,7,1,4,...,4,3,31,1,18,818,1,15,620.0,2.29
130,1.31,1.59,2.29,1.72,2.1,1.96,1.82,7,1,4,...,4,2,58,1,18,817,1,15,761.0,2.18
139,1.63,2.0,1.71,1.71,2.03,2.03,1.57,7,1,5,...,5,3,31,1,3,911,1,15,715.0,1.95
