In [1]:
"""
The goal of this experiment is to develop a model that will predict (to a certain level of acceptable accuracy) the students final GPAs, given the current progress.
"""

'\nThe goal of this experiment is to develop a model that will predict (to a certain level of acceptable accuracy) the students final GPAs, given the current progress.\n'

In [2]:
""" Import helper libraries """
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn import metrics
from sklearn import neighbors
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_squared_log_error
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict

In [3]:

""" Split Data into Training and Testing Sets """
def split_data(X, Y):
    return train_test_split(X, Y, test_size=0.2, random_state=17)

import numpy as np 

def matrix_factorization(X, U, V, K, steps=5000, alpha=0.0002, beta=0.02):
    V = V.T
    for step in range(steps):
        for i in range(len(X)):
            for j in range(len(X[i])):
                if X[i][j] > 0:
                    eij = X[i][j] - np.dot(U[i,:],V[:,j])
                    for k in range(K):
                        U[i][k] = U[i][k] + alpha * (2 * eij * V[k][j] - beta * U[i][k])
                        V[k][j] = V[k][j] + alpha * (2 * eij * U[i][k] - beta * V[k][j])
        eX = np.dot(U, V)
        e = 0
        for i in range(len(X)):
            for j in range(len(X[i])):
                if X[i][j] > 0:
                    e = e + pow(X[i][j] - np.dot(U[i,:],V[:,j]), 2)
                    for k in range(K):
                        e = e + (beta/2) * (pow(U[i][k],2) + pow(V[k][j],2))
        if e < 0.001:
            break
    
    return U.T, V

def obtain_course_cluster_matrix(X):
    N = len(X)
    M = len(X[0])

    P = np.random.rand(N,K)
    Q = np.random.rand(M,K)

    U, V = matrix_factorization(X, P, Q, K)
        
    for c in range(len(courses)):
        cluster = 0
        for curr in range(K):
            if V[curr][c] >= cluster:
                cluster = curr
        course_cluster[cluster].append(courses[c])
        
    return course_cluster


def calculate_num_of_courses(semester):
    start = -1
    end = -1
    if semester == 1:
        start = 0
        end = 5
    elif semester == 2:
        start = 6
        end = 13
    elif semester == 3:
        start = 14
        end = 21
    elif semester == 4:
        start = 22
        end = 29
    elif semester == 5:
        start = 30
        end = 37
    elif semester == 6:
        start = 38
        end = 38
    return start, end


def cal_index_for_course(semester):
    if semester == 1:
        return 0
    elif semester == 2:
        return 6
    elif semester == 3:
        return 14
    elif semester == 4:
        return 22
    elif semester == 5:
        return 30
    elif semester == 6:
        return 38

""" Extract feature (X) for target (y) column """
def extract_features_cols(semester, course):  
    feature_cols = []
    e_index = cal_index_for_course(semester)
    
    if(semester == 1):
        return feature_cols
    
    for cluster in range(K):  # to find the cluster in which our course is.
        if course in course_cluster[cluster]:
            index = cluster
            break
    
    my_cluster = course_cluster[index]
    
    for c in range(e_index):
        if courses[c] in my_cluster:
            feature_cols.append(courses[c])
            
    return feature_cols


def random_forest_train(df, semester):
    """
    To train and test accuracy using the random forest regressor with the datasets
    :param semester:
    :param data frame:
    """
    print("\n\n*** Random Forest: ***\n")
    
    seed = 7
    num_trees = 100
    s_index = cal_index_for_course(semester)
    feature_cols = list(df.columns[:4])
    
    for i in range(num_of_courses[semester-1]):
        course = courses[s_index + i]
        feature_cols.extend(extract_features_cols(semester, course))
        target_col = course
        print("Featured_cols:  \n", feature_cols)
        print("Target__ cols:  \n", target_col)
             
        X = df[feature_cols]  # feature values for all students
        y = df[target_col]  # corresponding targets/labels
        
        kfold = model_selection.KFold(n_splits=10, random_state=seed)
        
        X_train, X_test, y_train, y_test = split_data(X, y)
    
        sc_x = StandardScaler()
        X_train = sc_x.fit_transform(X_train)
        X_test = sc_x.transform(X_test)
        sc_y = StandardScaler()
        y_train = np.reshape(y_train, (-1, 1))
        y_train = sc_y.fit_transform(y_train)
        y_test = np.reshape(y_test, (-1, 1))
        y_test = sc_y.transform(y_test)

        regressor = RandomForestRegressor(n_estimators = num_trees, random_state = seed, oob_score = True, cv = kfold)
        regressor.fit(X_train, y_train)

        """ R^2 (coefficient of determination) regression score function """
        y_pred = regressor.predict(X_test)
        print("r2 score: ", r2_score(y_test , y_pred))

#         accuracies = cross_val_score(estimator = regressor, X = x_train, y = y_train, cv = 10)
#         print("Accuracies.mean: ", accuracies.mean())
#         print("S.d.: ", accuracies.std())
        """ Mean squared error regression loss """
        mse_test = mean_squared_error(y_test, y_pred)
        print("Mean squared error: ", mse_test)
        
#         """  Mean squared logarithmic regression loss """
#         mse_test = mean_squared_log_error(y_test, y_pred)
#         print("Mean squared logarithmic error: ", mse_test)
        
       
        # instantiate a random forest model
#         model = RandomForestRegressor(n_estimators=num_trees)
#         results = model_selection.cross_val_score(model, X, y, cv=kfold)
#         # check the accuracy on the training set
#         print("accuracy:", results.mean())
        
        feature_cols = list(df.columns[:4])
        
        print("\n\n")
    
    
""" kNN Algorithm for training the model using datasets """
def knn_train(df, semester):
    """
    To train the model using the kNN algorithm with the datasets
    :param data frame:
    :param semester:
    """
    
    print("\n\n*** KNN ***:\n")
    s_index = cal_index_for_course(semester)
    feature_cols = list(df.columns[:4])  #initialise with static features
    
    for i in range(num_of_courses[semester-1]):
        course = courses[s_index + i]
        feature_cols.extend(extract_features_cols(semester, course))
        target_col = course
        print("Featured_cols:  \n", feature_cols)
        print("Target__ cols:  \n", target_col)
        
        
        X = df[feature_cols]  # feature values for all students
        y = df[target_col]  # corresponding targets/labels
          
        X_train, X_test, y_train, y_test = split_data(X, y)
    
        # instantiate a kNN regression model, and fit with X and y
        model = neighbors.KNeighborsRegressor()
        model = model.fit(X_train, y_train)
        
        sc_x = StandardScaler()
        X_train = sc_x.fit_transform(X_train)
        X_test = sc_x.transform(X_test)
        sc_y = StandardScaler()
        y_train = np.reshape(y_train, (-1, 1))
        y_train = sc_y.fit_transform(y_train)
        y_test = np.reshape(y_test, (-1, 1))
        y_test = sc_y.transform(y_test)

        regressor = neighbors.KNeighborsRegressor()
        regressor = regressor.fit(X_train, y_train)

    
        """ R^2 (coefficient of determination) regression score function """
        y_pred = regressor.predict(X_test)
        print("r2 score: ", r2_score(y_test , y_pred))

        """ Mean squared error regression loss """
        mse_test = mean_squared_error(y_test, y_pred)
        print("Mean squared error: ", mse_test)
        
#         """  Mean squared logarithmic regression loss """
#         mse_test = mean_squared_log_error(y_test, y_pred)
#         print("Mean squared logarithmic error: ", mse_test)
        
        feature_cols = list(df.columns[:4])
        print("\n\n")

        
""" SVM Algorithm for training the model using datasets """
def svm_train(df, semester):
    """
    To train the model using the kNN algorithm with the datasets
    :param data frame:
    :param semester:
    """
    
    print("\n\n*** SVM ***:\n")
    s_index = cal_index_for_course(semester)
    feature_cols = list(df.columns[:4])  #initialise with static features
    for i in range(num_of_courses[semester-1]):
        course = courses[s_index + i]
        feature_cols.extend(extract_features_cols(semester, course))
        target_col = course
        print("Featured_cols:  \n", feature_cols)
        print("Target__ cols:  \n", target_col)
        
        
        X = df[feature_cols]  # feature values for all students
        y = df[target_col]  # corresponding targets/labels
          
        X_train, X_test, y_train, y_test = split_data(X, y)
    
        # instantiate a kNN regression model, and fit with X and y
        model = neighbors.KNeighborsRegressor()
        model = model.fit(X_train, y_train)
        
        sc_x = StandardScaler()
        X_train = sc_x.fit_transform(X_train)
        X_test = sc_x.transform(X_test)
        sc_y = StandardScaler()       
        y_train = np.reshape(y_train, (-1, 1))
        y_train = sc_y.fit_transform(y_train)
        y_test = np.reshape(y_test, (-1, 1))
        y_test = sc_y.transform(y_test)

        regressor = svm.SVR(C=1.0, epsilon=0.2)
        regressor = regressor.fit(X_train, y_train)

    
        """ R^2 (coefficient of determination) regression score function """
        y_pred = regressor.predict(X_test)
        print("r2 score: ", r2_score(y_test , y_pred))

        """ Mean squared error regression loss """
        mse_test = mean_squared_error(y_test, y_pred)
        print("Mean squared error: ", mse_test)
        
#         """  Mean squared logarithmic regression loss """
#         mse_test = mean_squared_log_error(y_test, y_pred)
#         print("Mean squared logarithmic error: ", mse_test)
        
        feature_cols = list(df.columns[:4])
        print("\n\n")

        feature_cols = list(df.columns[:4])        
        print("\n\n")
        

def linear_regression_train(semester, df):
    """
    To train and test accuracy using the linear regression with the datasets
    :param semester:
    :param data frame:
    """
    
    print("\n\n*** Linear Regression: ***\n")
    
    s_index = cal_index_for_course(semester)
    feature_cols = list(df.columns[:4])
    
    for i in range(num_of_courses[semester-1]):
        course = courses[s_index + i]
        feature_cols.extend(extract_features_cols(semester, course))
        target_col = course
        print("Featured_cols:  \n", feature_cols)
        print("Target__ cols:  \n", target_col)
        
        
        X = df[feature_cols]  # feature values for all students
        y = df[target_col]  # corresponding targets/labels
          
        X_train, X_test, y_train, y_test = split_data(X, y)
    
        # instantiate a kNN regression model, and fit with X and y
        model = neighbors.KNeighborsRegressor()
        model = model.fit(X_train, y_train)

        print(model.predict(X_test))
        
        sc_x = StandardScaler()
        X_train = sc_x.fit_transform(X_train)
        X_test = sc_x.transform(X_test)
        sc_y = StandardScaler()
        y_train = np.reshape(y_train, (-1, 1))
        y_train = sc_y.fit_transform(y_train)
        y_test = np.reshape(y_test, (-1, 1))
        y_test = sc_y.transform(y_test)

        regressor = LinearRegression()
        regressor = regressor.fit(X_train, y_train)

    
        """ R^2 (coefficient of determination) regression score function """
        y_pred = regressor.predict(X_test)
        print("r2 score: ", r2_score(y_test , y_pred))

        """ Mean squared error regression loss """
        mse_test = mean_squared_error(y_test, y_pred)
        print("Mean squared error: ", mse_test)
        
#         """  Mean squared logarithmic regression loss """
#         mse_test = mean_squared_log_error(y_test, y_pred)
#         print("Mean squared logarithmic error: ", mse_test)
        
        feature_cols = list(df.columns[:4])
        print("\n\n")

        feature_cols = list(df.columns[:4])        
        print("\n\n")

In [4]:
def main():
    
    """ Read data file as DataFrame """
    df = pd.read_csv("dataset.txt", sep=",")
    
    courses_df = df.loc[:, df.columns.isin(list(courses))]
    
    X = courses_df.as_matrix()
    
    semester = K
    
    course_cluster = obtain_course_cluster_matrix(X)
    
    print("Course Cluster:\n")    
    for item in course_cluster:
        print(item)
    
    print("\n\n")
    
    start, end = calculate_num_of_courses(semester)
    
    svm_train(df, semester)    
    knn_train(df, semester)
    linear_regression_train(df, semester)
    random_forest_train(df, semester)
    

In [5]:
# List of all courses offered in MCA program 
courses = ['psqt_gpa', 'me_gpa', 'co_gpa', 'pscp_gpa', 'pscp_lab_gpa', 'foss_lab_gpa', 
           'afm_gpa', 'ds_gpa', 'ss_gpa', 'oops_gpa', 'aad_gpa', 'ds_lab_gpa', 'ss_lab_gpa', 'oops_lab_gpa', 
           'os_gpa', 'dbms_gpa', 'wt_gpa', 'ospm_gpa', 'ads_gpa', 'os_lab_gpa', 'dbms_lab_gpa', 'wt_lab_gpa', 
           'pdwdm_gpa', 'ccn_gpa', 'se_gpa', 'ooad_gpa', 'dos_gpa', 'ke_lab_gpa', 'ccn_lab_gpa', 'set_lab_gpa', 
           'nps_gpa', 'uc_gpa', 'cc_gpa', 'dpsa_gpa', 'stt_gpa', 'seminar_viva_gpa', 'nps_lab_gpa', 'stt_lab_gpa', 
           'project_gpa']

num_of_courses = [6, 8, 8, 8, 8, 1] #number of courses per semester
 
K = int(input("Semester: ")) # total number of clusters

# relevant(or dependent) courses are grouped in same cluster
course_cluster=[]
for k in range(K):
    course_cluster.append([])
        
main()

Semester: 6
Course Cluster:

[]
[]
['psqt_gpa', 'co_gpa', 'pscp_gpa', 'foss_lab_gpa', 'afm_gpa', 'ds_gpa', 'ss_gpa', 'ds_lab_gpa', 'ss_lab_gpa', 'os_gpa', 'dbms_gpa', 'ads_gpa', 'os_lab_gpa', 'dbms_lab_gpa', 'pdwdm_gpa', 'ccn_gpa', 'dos_gpa', 'ke_lab_gpa', 'ccn_lab_gpa', 'nps_gpa', 'nps_lab_gpa']
['me_gpa', 'pscp_lab_gpa', 'oops_gpa', 'aad_gpa', 'oops_lab_gpa', 'se_gpa', 'ooad_gpa', 'set_lab_gpa', 'uc_gpa', 'cc_gpa', 'dpsa_gpa', 'stt_gpa', 'seminar_viva_gpa', 'project_gpa']
['ospm_gpa', 'stt_lab_gpa']
['wt_gpa', 'wt_lab_gpa']





*** SVM ***:

Featured_cols:  
 ['ten_gpa', 'twelth_gpa', 'colz_gpa', 'nimcet_marks', 'me_gpa', 'pscp_lab_gpa', 'oops_gpa', 'aad_gpa', 'oops_lab_gpa', 'se_gpa', 'ooad_gpa', 'set_lab_gpa', 'uc_gpa', 'cc_gpa', 'dpsa_gpa', 'stt_gpa', 'seminar_viva_gpa']
Target__ cols:  
 project_gpa


  return getattr(obj, method)(*args, **kwds)
  y = column_or_1d(y, warn=True)


r2 score:  0.43497136047469964
Mean squared error:  0.40287756345419645








*** KNN ***:

Featured_cols:  
 ['ten_gpa', 'twelth_gpa', 'colz_gpa', 'nimcet_marks', 'me_gpa', 'pscp_lab_gpa', 'oops_gpa', 'aad_gpa', 'oops_lab_gpa', 'se_gpa', 'ooad_gpa', 'set_lab_gpa', 'uc_gpa', 'cc_gpa', 'dpsa_gpa', 'stt_gpa', 'seminar_viva_gpa']
Target__ cols:  
 project_gpa
r2 score:  0.150354609929078
Mean squared error:  0.6058154235145385





*** Linear Regression: ***



ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().