In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import PolynomialFeatures

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedShuffleSplit

from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC 
from sklearn.ensemble import VotingClassifier

In [2]:
train_data = pd.read_csv("Final Data\\Training_Data.csv")
test_data = pd.read_csv("Final Data\\Testing_Data.csv")
train_data.head()

Unnamed: 0,STUDENT_ID,SEM_1_CGPA,SEM_2_CGPA,SEM_3_CGPA,SEM_4_CGPA,SEM_5_CGPA,SEM_6_CGPA,SEM_7_CGPA,SEM_8_CGPA,SEM_9_CGPA,...,FIRST_SEM,LAST_SEM,CITY,SECONDARY,SCHOOL,SEC_GRADE,HIGHER_SECONDARY,COLLEGE,HIG_SEC_GRADE,CGPA
0,2B2B36356C324C42466B435039537875512B307654673D3D,1.31,1.8,2.77,1.76,2.21,2.19,2.38,2.4,2.33,...,Fall 2014,Spring 2019,Sargodha,SSC,sargodha,784.0,HSSC,federal board,755.0,2.45
1,2B2B36514F6F74393035314E7836646661506F7849513D3D,2.35,2.44,2.4,2.37,2.29,2.28,2.37,2.41,0.0,...,Fall 2004,Spring 2008,Rawalpindi,SSC,rawalpindi,697.0,HSSC,feder board of intermedi and secondari educ,827.0,2.41
2,2B2B76586143454D6E4C6B6B6C46796F68774B6D70773D3D,2.09,2.48,2.53,2.46,2.47,2.53,2.61,2.6,2.59,...,Fall 2009,Spring 2013,Lahore,OLEVEL,other,335.0,ALEVEL,others,657.0,2.59
3,2B2F616B6E314F5859514A4739766B436A6869626D513D3D,2.86,2.95,2.86,2.86,2.9,2.94,2.94,2.95,0.0,...,Fall 2005,Spring 2009,Rawalpindi,SSC,other,657.0,HSSC,feder board of intermedi and secondari educ,722.0,2.95
4,2B2F633565686855486D6E6361626951365943465A773D3D,3.04,3.11,3.31,3.36,3.42,3.39,3.43,3.49,0.0,...,Fall 2015,Spring 2019,Rawalpindi,SSC,feder,873.0,HSSC,feder,752.0,3.49


In [3]:
def data_preprocessing(data):

    ## Encoding data
    labelencoder = LabelEncoder()

    labelencoder.fit(data['GENDER'])
    data['GENDER'] = labelencoder.transform(data['GENDER'])
    gender_class = list(labelencoder.classes_)

    labelencoder.fit(data['BATCH'])
    data['BATCH'] = labelencoder.transform(data['BATCH'])
    batch_class = list(labelencoder.classes_)

    labelencoder.fit(data['CAMPUS'])
    data['CAMPUS'] = labelencoder.transform(data['CAMPUS'])
    campus_class = list(labelencoder.classes_)

    labelencoder.fit(data['PROG_CODE'])
    data['PROG_CODE'] = labelencoder.transform(data['PROG_CODE'])
    prog_code_class = list(labelencoder.classes_)

    labelencoder.fit(data['FIRST_SEM'])
    data['FIRST_SEM'] = labelencoder.transform(data['FIRST_SEM'])
    first_sem_class = list(labelencoder.classes_)

    labelencoder.fit(data['LAST_SEM'])
    data['LAST_SEM'] = labelencoder.transform(data['LAST_SEM'])
    last_sem_class = list(labelencoder.classes_)

    labelencoder.fit(data['CITY'])
    data['CITY'] = labelencoder.transform(data['CITY'])
    city_class = list(labelencoder.classes_)

    labelencoder.fit(data['SECONDARY'])
    data['SECONDARY'] = labelencoder.transform(data['SECONDARY'])
    sec_class = list(labelencoder.classes_)

    labelencoder.fit(data['HIGHER_SECONDARY'])
    data['HIGHER_SECONDARY'] = labelencoder.transform(data['HIGHER_SECONDARY'])
    hig_sec_class = list(labelencoder.classes_)

    labelencoder.fit(data['SCHOOL'])
    data['SCHOOL'] = labelencoder.transform(data['SCHOOL'])
    school_class = list(labelencoder.classes_)

    labelencoder.fit(data['COLLEGE'])
    data['COLLEGE'] = labelencoder.transform(data['COLLEGE'])
    college_class = list(labelencoder.classes_)
    
    ## Rearranging columns
    data = data[['SEM_1_CGPA', 'SEM_2_CGPA', 'SEM_3_CGPA', 'SEM_4_CGPA', 'SEM_5_CGPA', 'SEM_6_CGPA', 'SEM_7_CGPA', 'TOTAL_SEM', 
                 'GENDER', 'BATCH', 'CAMPUS', 'PROG_CODE', 'FIRST_SEM', 'LAST_SEM', 'CITY', 'SECONDARY', 'SCHOOL', 'SEC_GRADE', 
                 'HIGHER_SECONDARY', 'COLLEGE', 'HIG_SEC_GRADE', 'CGPA']]
    
    return data

In [4]:
train = pd.DataFrame()
train = data_preprocessing(train_data)

test = pd.DataFrame()
test = data_preprocessing(test_data)
test = test.fillna(0)

# Regression 

In [5]:
X = train.iloc[:, :-1]
y = train.iloc[:, -1]

X = (X - X.min()) / (X.max() - X.min())

X_train, X_validate, y_train, y_validate = train_test_split(X, y, test_size=0.2, random_state=1)

print('Training Data : ', len(X_train))
print("Validation Data : ", len(X_validate))

Training Data :  11316
Validation Data :  2829


##  Linear Regression

In [6]:
"""
Linear regression applied on all features i.e: 'SEM_1_CGPA', 'SEM_2_CGPA', 'SEM_3_CGPA', 'SEM_4_CGPA', 'SEM_5_CGPA',
'SEM_6_CGPA', 'SEM_7_CGPA', 'TOTAL_SEM', 'GENDER', 'BATCH', 'CAMPUS', 'PROG_CODE', 'FIRST_SEM', 'LAST_SEM', 'CITY', 
'SECONDARY', 'SCHOOL', 'SEC_GRADE', 'HIGHER_SECONDARY', 'COLLEGE', 'HIG_SEC_GRADE', 'CGPA'
"""

model1 = LinearRegression()
model1.fit(X_train, y_train)
y_pred = model1.predict(X_validate)
model1_acc = r2_score(y_pred, y_validate)
rmse_1 = np.sqrt(mean_squared_error(y_validate, y_pred))
print("R^2:", model1_acc)
print("RMSE:", rmse_1)

R^2: 0.959058791017585
RMSE: 0.09044692581324311


## Polynomial Regression 

In [7]:
degree = 2
polynomial_features = PolynomialFeatures(degree = degree)

model2 = LinearRegression()

X_train_ = polynomial_features.fit_transform(X_train)
model2.fit(X_train_, y_train)

X_validate_ = polynomial_features.fit_transform(X_validate)
y_pred = model2.predict(X_validate_)

model2_acc = r2_score(y_validate, y_pred)
rmse_2 = np.sqrt(mean_squared_error(y_validate, y_pred))
print("R^2:", model2_acc)
print("RMSE:", rmse_2)

R^2: 0.9663044360673213
RMSE: 0.08433636863916234


# Classification 

In [8]:
labelencoder = LabelEncoder()
y = labelencoder.fit_transform(pd.cut(train['CGPA'], 5, retbins=True)[0])
print(list(labelencoder.classes_))

[Interval(1.998, 2.4, closed='right'), Interval(2.4, 2.8, closed='right'), Interval(2.8, 3.2, closed='right'), Interval(3.2, 3.6, closed='right'), Interval(3.6, 4.0, closed='right')]


In [9]:
X_train, X_validate, y_train, y_validate = train_test_split(X, y, test_size=0.3, random_state=1)

## Gaussian Naive Bayes

In [10]:
model3 = GaussianNB()
model3.fit(X_train, y_train)
y_pred = model3.predict(X_validate)
model3_acc = accuracy_score(y_validate, y_pred)
print("Accuracy:", model3_acc)

Accuracy: 0.7905278039585297


## Logistic Regression

In [11]:
model4 = LogisticRegression(solver='sag', multi_class='multinomial')
model4.fit(X_train, y_train)
model4_acc = model4.score(X_validate, y_validate)
print("Accuracy:", model4_acc)

Accuracy: 0.8261074458058435


## K-Nearest Neighbour

In [12]:
for i in range(1, 10):
    model5 = KNeighborsClassifier(n_neighbors=i) 
    model5.fit(X_train, y_train) 
    y_pred = model5.predict(X_validate) 
    model5_acc = accuracy_score(y_validate, y_pred)
    print(f"{i}-NN - Accuracy Score: {model5_acc}") 

1-NN - Accuracy Score: 0.6623468426013195
2-NN - Accuracy Score: 0.6588124410933082
3-NN - Accuracy Score: 0.6875589066918002
4-NN - Accuracy Score: 0.6910933081998115
5-NN - Accuracy Score: 0.6981621112158342
6-NN - Accuracy Score: 0.6986333647502356
7-NN - Accuracy Score: 0.7057021677662583
8-NN - Accuracy Score: 0.7094721960414703
9-NN - Accuracy Score: 0.7108859566446748


## Decision Tree Classifier

In [13]:
model6 = DecisionTreeClassifier(criterion = 'entropy', ccp_alpha = 0.015)
model6.fit(X_train, y_train)
y_pred = model6.predict(X_validate)
model6_acc = accuracy_score(y_validate, y_pred)
print("Accuracy: ", model6_acc)

Accuracy:  0.8631008482563619


## Random Forest Classifier

In [14]:
model7 = RandomForestClassifier(n_estimators=100, bootstrap = True, max_features = 'sqrt', criterion = 'entropy')
model7.fit(X_train, y_train)
y_pred = model7.predict(X_validate)
model7_acc = accuracy_score(y_validate, y_pred)
print("Accuracy: ", model7_acc)

Accuracy:  0.8598020735155514


## Support Vector Machine

In [15]:
model8 = SVC(kernel="linear", C=1)
model8.fit(X_train, y_train)
y_pred = model8.predict(X_validate)
model8_acc = accuracy_score(y_validate, y_pred)
print("Accuracy: ", model8_acc)

Accuracy:  0.8470782280867106


## Ensemble Classifier

In [16]:
final = VotingClassifier(estimators=[('gnb', model3), 
                                     ('lgr', model4), 
                                     ('knn', model5), 
                                     ('dt', model6), 
                                     ('rf', model7),
                                     ('svm', model8)], 
                         voting='hard')

final.fit(X_train, y_train)
y_pred = final.predict(X_validate)
final_acc = accuracy_score(y_validate, y_pred)
print("Accuracy: ", final_acc)

Accuracy:  0.8456644674835061


# Results

In [17]:
print("=========Accuracy==========\n")

print("Linear Regression")
print("R-Squared:", round(model1_acc, 3))
print("RMSE: ", round(rmse_1, 3))

print("\nPolynmial Regression with degree 2")
print("R-Squared:", round(model2_acc, 3))
print("RMSE: ", round(rmse_2, 3))

print("\nGaussian Naive Bayes:", round(model3_acc, 3))
print("Logistic Regression:", round(model4_acc, 3))
print("K - Nearest Neighbor:", round(model5_acc, 3))
print("Decision Tree Classifier:", round(model6_acc, 3))
print("Random Forest Classifier:", round(model7_acc, 3))
print("Support Vector Machine:", round(model8_acc, 3))
print("Ensemble Classifier:", round(final_acc, 3))


Linear Regression
R-Squared: 0.959
RMSE:  0.09

Polynmial Regression with degree 2
R-Squared: 0.966
RMSE:  0.084

Gaussian Naive Bayes: 0.791
Logistic Regression: 0.826
K - Nearest Neighbor: 0.711
Decision Tree Classifier: 0.863
Random Forest Classifier: 0.86
Support Vector Machine: 0.847
Ensemble Classifier: 0.846


# Classification using Hold-Out Approach Repeated 10 times

In [18]:
CV = StratifiedShuffleSplit(n_splits = 10, test_size = 0.3, random_state = 0)

## Gaussian Naive Bayes

In [19]:
model3 = GaussianNB()
model3_acc = cross_val_score(model3, X_train, y_train, cv=CV)
print("Accuracy:", model3_acc)

Accuracy: [0.77347694 0.78020868 0.7798721  0.78593066 0.78626725 0.77280377
 0.77953551 0.78121844 0.78256479 0.78694042]


## Logistic Regression

In [20]:
model4 = LogisticRegression(solver='sag', multi_class='multinomial')
model4_acc = cross_val_score(model4, X_train, y_train, cv=CV)
print("Accuracy: ", model4_acc)

Accuracy:  [0.80276001 0.81252104 0.81386738 0.81555032 0.81925278 0.80410636
 0.8135308  0.81992595 0.81218445 0.82329182]


## K-Nearest Neighbour

In [21]:
model5 = KNeighborsClassifier(n_neighbors=3)
model5_acc = cross_val_score(model5, X_train, y_train, cv=CV)
print("Accuracy: ", model5_acc)

Accuracy:  [0.65196903 0.67149108 0.65937395 0.65903736 0.66610569 0.66341299
 0.66778862 0.67788623 0.66509593 0.66677886]


## Decision Tree Classifier

In [22]:
model6 = DecisionTreeClassifier(criterion = 'entropy', ccp_alpha = 0.015)
model6_acc = cross_val_score(model6, X_train, y_train, cv=CV, n_jobs = -1)
print("Accuracy: ", model6_acc)

Accuracy:  [0.84483339 0.85594076 0.8599798  0.84853585 0.86368226 0.84079435
 0.8522383  0.84786267 0.85291148 0.83641871]


## Random Forest Classifier

In [23]:
model7 = RandomForestClassifier(n_estimators=100, bootstrap = True, max_features = 'sqrt', criterion = 'entropy')
model7_acc = cross_val_score(model7, X_train, y_train, cv=CV, n_jobs = -1)
print("Accuracy: ", model7_acc)

Accuracy:  [0.85291148 0.86368226 0.86637496 0.85156513 0.86334567 0.84584315
 0.86401885 0.85863346 0.86132615 0.86233591]


## Support Vector Machine

In [24]:
model8 = SVC(kernel="linear", C=1)
model8_acc = cross_val_score(model6, X_train, y_train, cv=CV, n_jobs = -1)
print("Accuracy: ", model8_acc)

Accuracy:  [0.84483339 0.85594076 0.8599798  0.84853585 0.86368226 0.84079435
 0.8522383  0.84786267 0.85291148 0.83641871]


## Ensemble Classifier

In [25]:
final = VotingClassifier(estimators=[('gnb', model3), 
                                     ('lgr', model4), 
                                     ('knn', model5), 
                                     ('dt', model6), 
                                     ('rf', model7),
                                     ('svm', model8)], 
                         voting='hard')

final_acc = cross_val_score(final, X_train, y_train, cv=CV, n_jobs = -1)
print("Accuracy: ", final_acc)

Accuracy:  [0.82867721 0.8471895  0.84617974 0.84180411 0.8471895  0.82968697
 0.84685291 0.84180411 0.83540895 0.84079435]


# Results of Classification using Hold-out Repeated 10 times

In [26]:
print("==========================Accuracy=========================\n")

print("Gaussian Naive Bayes:", np.around(model3_acc, 3))
print(f"Mean Score: {np.round(np.mean(model3_acc), 3)}    Standard Deviation: {np.round(np.std(model3_acc), 3)}")

print("\nLogistic Regression:", np.around(model4_acc, 3))
print(f"Mean Score: {np.round(np.mean(model4_acc), 3)}    Standard Deviation: {np.round(np.std(model4_acc), 3)}")

print("\nK - Nearest Neighbor:", np.around(model5_acc, 3))
print(f"Mean Score: {np.round(np.mean(model5_acc), 3)}    Standard Deviation: {np.round(np.std(model5_acc), 3)}")

print("\nDecision Tree Classifier:", np.around(model6_acc, 3))
print(f"Mean Score: {np.round(np.mean(model6_acc), 3)}    Standard Deviation: {np.round(np.std(model6_acc), 3)}")

print("\nRandom Forest Classifier:", np.around(model7_acc, 3))
print(f"Mean Score: {np.round(np.mean(model7_acc), 3)}    Standard Deviation: {np.round(np.std(model7_acc), 3)}")

print("\nSupport Vector Machine:", np.around(model8_acc, 3))
print(f"Mean Score: {np.round(np.mean(model8_acc), 3)}    Standard Deviation: {np.round(np.std(model8_acc), 3)}")

print("\nEnsemble Classifier:", np.around(final_acc, 3))
print(f"Mean Score: {np.round(np.mean(final_acc), 3)}     Standard Deviation: {np.round(np.std(final_acc), 3)}")


Gaussian Naive Bayes: [0.773 0.78  0.78  0.786 0.786 0.773 0.78  0.781 0.783 0.787]
Mean Score: 0.781    Standard Deviation: 0.005

Logistic Regression: [0.803 0.813 0.814 0.816 0.819 0.804 0.814 0.82  0.812 0.823]
Mean Score: 0.814    Standard Deviation: 0.006

K - Nearest Neighbor: [0.652 0.671 0.659 0.659 0.666 0.663 0.668 0.678 0.665 0.667]
Mean Score: 0.665    Standard Deviation: 0.007

Decision Tree Classifier: [0.845 0.856 0.86  0.849 0.864 0.841 0.852 0.848 0.853 0.836]
Mean Score: 0.85    Standard Deviation: 0.008

Random Forest Classifier: [0.853 0.864 0.866 0.852 0.863 0.846 0.864 0.859 0.861 0.862]
Mean Score: 0.859    Standard Deviation: 0.006

Support Vector Machine: [0.845 0.856 0.86  0.849 0.864 0.841 0.852 0.848 0.853 0.836]
Mean Score: 0.85    Standard Deviation: 0.008

Ensemble Classifier: [0.829 0.847 0.846 0.842 0.847 0.83  0.847 0.842 0.835 0.841]
Mean Score: 0.841     Standard Deviation: 0.007


# Principal Component Analysis

In [27]:
from sklearn.decomposition import PCA

pca = PCA()
pca.fit(X_train)
print(pca.explained_variance_ratio_)
print(pca.explained_variance_)

[0.23006412 0.15975953 0.13949244 0.109623   0.08277403 0.06267701
 0.04878813 0.04196031 0.02978696 0.02715582 0.02066243 0.0185438
 0.01186001 0.00475256 0.00423499 0.00221978 0.0016118  0.00152454
 0.00121146 0.00082864 0.00046864]
[0.24809946 0.1722835  0.15042762 0.11821664 0.0892629  0.06759042
 0.05261276 0.04524969 0.03212204 0.02928463 0.02228222 0.0199975
 0.01278975 0.00512512 0.00456698 0.0023938  0.00173815 0.00164405
 0.00130642 0.0008936  0.00050537]


# Predictive Modeling

In [28]:
X = train.iloc[:, :-1]
y = train.iloc[:, -1]

X_test = test.iloc[:, :-1]

In [29]:
X_ = X[['SEM_1_CGPA', 'TOTAL_SEM', 'GENDER', 'BATCH', 'CAMPUS','PROG_CODE', 'FIRST_SEM', 'LAST_SEM', 'CITY', 'SECONDARY', 'SCHOOL',
        'SEC_GRADE', 'HIGHER_SECONDARY', 'COLLEGE', 'HIG_SEC_GRADE']]

test_sem_2 = X_test[X_test["TOTAL_SEM"] == 1]
test_sem_2 = test_sem_2[['SEM_1_CGPA', 'TOTAL_SEM', 'GENDER', 'BATCH', 'CAMPUS','PROG_CODE', 'FIRST_SEM', 'LAST_SEM', 'CITY', 
                         'SECONDARY', 'SCHOOL','SEC_GRADE', 'HIGHER_SECONDARY', 'COLLEGE', 'HIG_SEC_GRADE']]

print("Predict CGPA of 2nd Semester\nTotal Records:", len(test_sem_2))

model = LinearRegression()
model.fit(X_, y)
y_pred = np.round(model.predict(test_sem_2), 2)
test_sem_2['SEM_2_CGPA'] = y_pred
test_sem_2.head()

Predict CGPA of 2nd Semester
Total Records: 1719


Unnamed: 0,SEM_1_CGPA,TOTAL_SEM,GENDER,BATCH,CAMPUS,PROG_CODE,FIRST_SEM,LAST_SEM,CITY,SECONDARY,SCHOOL,SEC_GRADE,HIGHER_SECONDARY,COLLEGE,HIG_SEC_GRADE,SEM_2_CGPA
1,2.18,1,1,7,3,4,7,4,84,0,66,748.0,0,39,880.0,3.29
5,2.51,1,1,7,4,2,7,4,46,2,46,824.0,1,21,837.0,3.36
7,3.29,1,1,7,2,2,7,4,38,0,66,4.0,0,39,4.0,3.38
8,3.45,1,1,7,3,2,7,4,49,0,66,641.0,0,39,795.0,3.69
9,2.86,1,0,7,1,2,7,4,32,0,66,759.0,0,39,870.0,3.62


In [30]:
X_ = X[['SEM_1_CGPA', 'SEM_2_CGPA', 'TOTAL_SEM', 'GENDER', 'BATCH', 'CAMPUS','PROG_CODE', 'FIRST_SEM', 'LAST_SEM', 'CITY', 
        'SECONDARY', 'SCHOOL', 'SEC_GRADE', 'HIGHER_SECONDARY', 'COLLEGE', 'HIG_SEC_GRADE']]

test_sem_3 = X_test[X_test["TOTAL_SEM"] == 2]
test_sem_3 = test_sem_3[['SEM_1_CGPA', 'SEM_2_CGPA', 'TOTAL_SEM', 'GENDER', 'BATCH', 'CAMPUS','PROG_CODE', 'FIRST_SEM', 
                         'LAST_SEM', 'CITY', 'SECONDARY', 'SCHOOL','SEC_GRADE', 'HIGHER_SECONDARY', 'COLLEGE', 'HIG_SEC_GRADE']]

print("\nPredict CGPA of 3rd Semester\nTotal Records:", len(test_sem_3))

model = LinearRegression()
model.fit(X_, y)
y_pred = np.round(model.predict(test_sem_3), 2)
test_sem_3['SEM_3_CGPA'] = y_pred
test_sem_3.head()


Predict CGPA of 3rd Semester
Total Records: 43


Unnamed: 0,SEM_1_CGPA,SEM_2_CGPA,TOTAL_SEM,GENDER,BATCH,CAMPUS,PROG_CODE,FIRST_SEM,LAST_SEM,CITY,SECONDARY,SCHOOL,SEC_GRADE,HIGHER_SECONDARY,COLLEGE,HIG_SEC_GRADE,SEM_3_CGPA
195,1.8,2.53,2,1,5,3,2,5,0,93,2,34,969.0,1,3,898.0,3.09
512,3.47,3.45,2,0,6,0,2,6,4,20,0,66,804.0,0,16,960.0,3.74
793,3.39,2.95,2,1,6,2,4,6,3,38,0,66,664.0,0,39,755.0,3.37
801,3.29,2.95,2,1,6,3,0,6,3,49,0,66,737.0,0,39,810.0,3.36
961,2.54,2.49,2,1,6,3,0,6,3,49,2,61,750.0,1,33,663.0,3.05


In [31]:
X_ = X[['SEM_1_CGPA', 'SEM_2_CGPA', 'SEM_3_CGPA', 'TOTAL_SEM', 'GENDER', 'BATCH', 'CAMPUS','PROG_CODE', 'FIRST_SEM', 
        'LAST_SEM', 'CITY', 'SECONDARY', 'SCHOOL', 'SEC_GRADE', 'HIGHER_SECONDARY', 'COLLEGE', 'HIG_SEC_GRADE']]

test_sem_4 = X_test[X_test["TOTAL_SEM"] == 3]
test_sem_4 = test_sem_4[['SEM_1_CGPA', 'SEM_2_CGPA', 'SEM_3_CGPA', 'TOTAL_SEM', 'GENDER', 'BATCH', 'CAMPUS','PROG_CODE', 
                         'FIRST_SEM', 'LAST_SEM', 'CITY', 'SECONDARY', 'SCHOOL','SEC_GRADE', 'HIGHER_SECONDARY', 'COLLEGE', 
                         'HIG_SEC_GRADE']]

print("\nPredict CGPA of 4th Semester\nTotal Records:", len(test_sem_4))

model = LinearRegression()
model.fit(X_, y)
y_pred = np.round(model.predict(test_sem_4), 2)
test_sem_4['SEM_4_CGPA'] = y_pred
test_sem_4.head()


Predict CGPA of 4th Semester
Total Records: 1591


Unnamed: 0,SEM_1_CGPA,SEM_2_CGPA,SEM_3_CGPA,TOTAL_SEM,GENDER,BATCH,CAMPUS,PROG_CODE,FIRST_SEM,LAST_SEM,CITY,SECONDARY,SCHOOL,SEC_GRADE,HIGHER_SECONDARY,COLLEGE,HIG_SEC_GRADE,SEM_4_CGPA
11,2.52,2.47,2.37,3,1,6,1,0,6,4,77,2,59,937.0,1,33,901.0,2.88
15,2.82,3.0,3.03,3,1,6,2,2,6,4,38,0,66,698.0,0,39,865.0,3.31
16,3.57,3.55,3.68,3,1,6,1,2,6,4,77,2,59,899.0,1,33,889.0,3.78
21,2.83,2.55,2.61,3,0,6,3,0,6,4,49,0,66,737.0,0,39,850.0,3.07
22,3.25,2.93,2.96,3,1,6,1,0,6,4,77,2,59,1024.0,1,33,943.0,3.29


In [32]:
X_ = X[['SEM_1_CGPA', 'SEM_2_CGPA', 'SEM_3_CGPA', 'SEM_4_CGPA', 'TOTAL_SEM', 'GENDER', 'BATCH', 'CAMPUS','PROG_CODE', 
        'FIRST_SEM', 'LAST_SEM', 'CITY', 'SECONDARY', 'SCHOOL', 'SEC_GRADE', 'HIGHER_SECONDARY', 'COLLEGE', 'HIG_SEC_GRADE']]

test_sem_5 = X_test[X_test["TOTAL_SEM"] == 4]
test_sem_5 = test_sem_5[['SEM_1_CGPA', 'SEM_2_CGPA', 'SEM_3_CGPA', 'SEM_4_CGPA', 'TOTAL_SEM', 'GENDER', 'BATCH', 'CAMPUS',
                         'PROG_CODE', 'FIRST_SEM', 'LAST_SEM', 'CITY', 'SECONDARY', 'SCHOOL','SEC_GRADE', 'HIGHER_SECONDARY', 
                         'COLLEGE', 'HIG_SEC_GRADE']]

print("\nPredict CGPA of 5th Semester\nTotal Records:", len(test_sem_5))

model = LinearRegression()
model.fit(X_, y)
y_pred = np.round(model.predict(test_sem_5), 2)
test_sem_5['SEM_5_CGPA'] = y_pred
test_sem_5.head()


Predict CGPA of 5th Semester
Total Records: 351


Unnamed: 0,SEM_1_CGPA,SEM_2_CGPA,SEM_3_CGPA,SEM_4_CGPA,TOTAL_SEM,GENDER,BATCH,CAMPUS,PROG_CODE,FIRST_SEM,LAST_SEM,CITY,SECONDARY,SCHOOL,SEC_GRADE,HIGHER_SECONDARY,COLLEGE,HIG_SEC_GRADE,SEM_5_CGPA
6,2.31,2.22,2.37,2.22,4,0,6,0,2,6,4,49,2,47,890.0,1,33,868.0,2.58
37,0.0,3.31,3.38,2.75,4,1,6,3,3,6,4,49,0,66,765.0,0,39,885.0,2.95
42,2.53,2.42,2.6,2.57,4,1,6,1,2,6,4,73,0,66,731.0,0,39,885.0,2.8
81,1.88,1.75,2.25,2.14,4,1,6,1,2,6,4,77,2,1,874.0,1,33,822.0,2.47
108,2.24,2.32,2.53,2.86,4,1,6,1,2,6,4,38,2,44,773.0,1,33,832.0,3.01


In [33]:
X_ = X[['SEM_1_CGPA', 'SEM_2_CGPA', 'SEM_3_CGPA', 'SEM_4_CGPA', 'SEM_5_CGPA', 'TOTAL_SEM', 'GENDER', 'BATCH', 'CAMPUS',
        'PROG_CODE', 'FIRST_SEM', 'LAST_SEM', 'CITY', 'SECONDARY', 'SCHOOL', 'SEC_GRADE', 'HIGHER_SECONDARY', 'COLLEGE', 
        'HIG_SEC_GRADE']]

test_sem_6 = X_test[X_test["TOTAL_SEM"] == 5]
test_sem_6 = test_sem_6[['SEM_1_CGPA', 'SEM_2_CGPA', 'SEM_3_CGPA', 'SEM_4_CGPA', 'SEM_5_CGPA', 'TOTAL_SEM', 'GENDER', 'BATCH', 
                         'CAMPUS','PROG_CODE', 'FIRST_SEM', 'LAST_SEM', 'CITY', 'SECONDARY', 'SCHOOL','SEC_GRADE', 
                         'HIGHER_SECONDARY', 'COLLEGE', 'HIG_SEC_GRADE']]

print("\nPredict CGPA of 6th Semester\nTotal Records:", len(test_sem_6))

model = LinearRegression() 
model.fit(X_, y)
y_pred = np.round(model.predict(test_sem_6), 2)
test_sem_6['SEM_6_CGPA'] = y_pred
test_sem_6.head()


Predict CGPA of 6th Semester
Total Records: 1285


Unnamed: 0,SEM_1_CGPA,SEM_2_CGPA,SEM_3_CGPA,SEM_4_CGPA,SEM_5_CGPA,TOTAL_SEM,GENDER,BATCH,CAMPUS,PROG_CODE,FIRST_SEM,LAST_SEM,CITY,SECONDARY,SCHOOL,SEC_GRADE,HIGHER_SECONDARY,COLLEGE,HIG_SEC_GRADE,SEM_6_CGPA
0,1.47,2.25,2.21,2.11,2.06,5,1,5,2,2,5,4,2,2,40,603.0,2,28,756.84,2.31
3,3.92,3.83,3.9,3.91,3.86,5,0,5,3,0,5,4,49,0,66,810.0,0,39,945.0,3.87
4,3.65,3.44,3.5,3.6,3.64,5,0,5,0,4,5,4,20,2,34,1026.0,1,33,945.0,3.7
14,3.1,2.74,2.43,2.32,2.39,5,1,5,3,2,5,4,24,2,39,843.0,1,33,920.0,2.59
25,3.23,3.0,2.76,2.58,2.51,5,1,5,0,2,5,4,20,2,6,929.0,1,15,868.0,2.71


In [34]:
X_ = X[['SEM_1_CGPA', 'SEM_2_CGPA', 'SEM_3_CGPA', 'SEM_4_CGPA', 'SEM_5_CGPA', 'SEM_6_CGPA', 'TOTAL_SEM', 'GENDER', 'BATCH', 
        'CAMPUS', 'PROG_CODE', 'FIRST_SEM', 'LAST_SEM', 'CITY', 'SECONDARY', 'SCHOOL', 'SEC_GRADE', 'HIGHER_SECONDARY', 
        'COLLEGE', 'HIG_SEC_GRADE']]

test_sem_7 = X_test[X_test["TOTAL_SEM"] == 6]
test_sem_7 = test_sem_7[['SEM_1_CGPA', 'SEM_2_CGPA', 'SEM_3_CGPA', 'SEM_4_CGPA', 'SEM_5_CGPA', 'SEM_6_CGPA', 'TOTAL_SEM', 
                         'GENDER', 'BATCH', 'CAMPUS','PROG_CODE', 'FIRST_SEM', 'LAST_SEM', 'CITY', 'SECONDARY', 'SCHOOL',
                         'SEC_GRADE', 'HIGHER_SECONDARY', 'COLLEGE', 'HIG_SEC_GRADE']]

print("\nPredict CGPA of 7th Semester\nTotal Records:", len(test_sem_7))

model = LinearRegression()
model.fit(X_, y)
y_pred = np.round(model.predict(test_sem_7), 2)
test_sem_7['SEM_7_CGPA'] = y_pred
test_sem_7.head()


Predict CGPA of 7th Semester
Total Records: 416


Unnamed: 0,SEM_1_CGPA,SEM_2_CGPA,SEM_3_CGPA,SEM_4_CGPA,SEM_5_CGPA,SEM_6_CGPA,TOTAL_SEM,GENDER,BATCH,CAMPUS,...,FIRST_SEM,LAST_SEM,CITY,SECONDARY,SCHOOL,SEC_GRADE,HIGHER_SECONDARY,COLLEGE,HIG_SEC_GRADE,SEM_7_CGPA
2,1.17,2.1,1.85,2.08,2.07,2.12,6,0,5,1,...,5,4,95,2,47,905.0,1,16,710.0,2.35
17,2.73,2.76,2.52,1.94,2.03,2.0,6,1,5,3,...,5,4,49,2,47,992.0,1,33,855.0,2.17
31,2.24,1.64,2.09,2.2,2.39,2.41,6,1,5,1,...,5,4,32,0,66,759.0,0,39,890.0,2.58
41,2.29,1.78,2.22,2.27,2.48,2.66,6,1,4,0,...,5,4,14,2,34,890.0,1,33,766.0,2.81
66,2.82,2.83,2.45,2.52,2.51,2.63,6,1,4,3,...,4,4,71,2,3,985.0,1,20,866.0,2.75


In [35]:
X_ = X[['SEM_1_CGPA', 'SEM_2_CGPA', 'SEM_3_CGPA', 'SEM_4_CGPA', 'SEM_5_CGPA', 'SEM_6_CGPA', 'SEM_7_CGPA', 'TOTAL_SEM', 'GENDER', 
        'BATCH', 'CAMPUS', 'PROG_CODE', 'FIRST_SEM', 'LAST_SEM', 'CITY', 'SECONDARY', 'SCHOOL', 'SEC_GRADE', 'HIGHER_SECONDARY', 
        'COLLEGE', 'HIG_SEC_GRADE']]

test_sem_8 = X_test[X_test["TOTAL_SEM"] == 7]
test_sem_8 = test_sem_8[['SEM_1_CGPA', 'SEM_2_CGPA', 'SEM_3_CGPA', 'SEM_4_CGPA', 'SEM_5_CGPA', 'SEM_6_CGPA', 'SEM_7_CGPA', 
                         'TOTAL_SEM', 'GENDER', 'BATCH', 'CAMPUS','PROG_CODE', 'FIRST_SEM', 'LAST_SEM', 'CITY', 'SECONDARY', 
                         'SCHOOL','SEC_GRADE', 'HIGHER_SECONDARY', 'COLLEGE', 'HIG_SEC_GRADE']]

print("\nPredict CGPA of 8th Semester\nTotal Records:", len(test_sem_8))

model = LinearRegression()
model.fit(X_, y)
y_pred = np.round(model.predict(test_sem_8), 2)
test_sem_8['SEM_8_CGPA'] = y_pred
test_sem_8.head()


Predict CGPA of 8th Semester
Total Records: 1218


Unnamed: 0,SEM_1_CGPA,SEM_2_CGPA,SEM_3_CGPA,SEM_4_CGPA,SEM_5_CGPA,SEM_6_CGPA,SEM_7_CGPA,TOTAL_SEM,GENDER,BATCH,...,FIRST_SEM,LAST_SEM,CITY,SECONDARY,SCHOOL,SEC_GRADE,HIGHER_SECONDARY,COLLEGE,HIG_SEC_GRADE,SEM_8_CGPA
12,2.63,2.49,2.3,2.53,2.48,2.58,2.51,7,1,4,...,4,4,49,0,66,709.0,1,33,900.0,2.58
19,3.33,3.31,3.29,3.16,3.26,3.29,3.27,7,1,4,...,4,4,49,0,66,765.0,0,39,905.0,3.27
20,2.67,2.59,2.47,2.42,2.29,2.29,2.32,7,1,4,...,4,4,49,2,47,972.0,1,33,929.0,2.39
32,2.39,1.94,2.02,2.01,2.08,2.1,2.12,7,1,4,...,11,4,73,2,3,829.0,1,30,784.0,2.21
35,2.14,2.43,2.25,2.57,2.64,2.66,2.82,7,1,4,...,4,4,49,2,47,1019.0,1,28,857.0,2.86
