## Data Preprocessing and Logistic regression to find whether the student gets admitted or not.
GPA below 2.5 -> not admitted

GPA above 2.5 -> admitted

### Importing the relevant libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
original_data=pd.read_csv('student_performance.csv')
data=original_data.copy()

data.head()

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
0,1001,17,1,0,2,19.833723,7,1,2,0,0,1,0,2.929196,2.0
1,1002,18,0,0,1,15.408756,0,0,1,0,0,0,0,3.042915,1.0
2,1003,15,0,2,3,4.21057,26,0,2,0,0,0,0,0.112602,4.0
3,1004,17,1,0,3,10.028829,14,0,3,1,0,0,0,2.054218,3.0
4,1005,17,1,0,2,4.672495,17,1,3,0,0,0,0,1.288061,4.0


In [3]:
columns_to_drop=['StudentID','GradeClass']   #dropping columns
data=data.drop(columns_to_drop,axis=1)

data.shape

(2392, 13)

In [4]:
admitted=[]                       #adding column stating whether student gets admitted or not.
for i in range(2392):
    if data['GPA'][i]>=2.5:
        admitted.append(1)
    else:
        admitted.append(0)

data['GPA']=admitted
data=data.rename(columns={'GPA':'admitted'})
data.head()

Unnamed: 0,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,admitted
0,17,1,0,2,19.833723,7,1,2,0,0,1,0,1
1,18,0,0,1,15.408756,0,0,1,0,0,0,0,1
2,15,0,2,3,4.21057,26,0,2,0,0,0,0,0
3,17,1,0,3,10.028829,14,0,3,1,0,0,0,0
4,17,1,0,2,4.672495,17,1,3,0,0,0,0,0


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2392 entries, 0 to 2391
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Age                2392 non-null   int64  
 1   Gender             2392 non-null   int64  
 2   Ethnicity          2392 non-null   int64  
 3   ParentalEducation  2392 non-null   int64  
 4   StudyTimeWeekly    2392 non-null   float64
 5   Absences           2392 non-null   int64  
 6   Tutoring           2392 non-null   int64  
 7   ParentalSupport    2392 non-null   int64  
 8   Extracurricular    2392 non-null   int64  
 9   Sports             2392 non-null   int64  
 10  Music              2392 non-null   int64  
 11  Volunteering       2392 non-null   int64  
 12  admitted           2392 non-null   int64  
dtypes: float64(1), int64(12)
memory usage: 243.1 KB


In [6]:
#let us seaparate the numerical data and scale it and keep.
data.columns

Index(['Age', 'Gender', 'Ethnicity', 'ParentalEducation', 'StudyTimeWeekly',
       'Absences', 'Tutoring', 'ParentalSupport', 'Extracurricular', 'Sports',
       'Music', 'Volunteering', 'admitted'],
      dtype='object')

In [7]:
inputs=data[['Age', 'Gender', 'Ethnicity', 'ParentalEducation', 'StudyTimeWeekly',
       'Absences', 'Tutoring', 'ParentalSupport', 'Extracurricular', 'Sports',
       'Music', 'Volunteering']]
inputs_to_be_scaled=data[['Age','Gender', 'ParentalEducation', 'StudyTimeWeekly',
       'Absences', 'Tutoring', 'ParentalSupport', 'Extracurricular', 'Sports',
       'Music', 'Volunteering']]
categorical_inputs=data['Ethnicity']
targets=data['admitted']

In [8]:
scaler=StandardScaler()     #scaling the data.
scaler.fit(inputs_to_be_scaled)
scaled_inputs=scaler.transform(inputs_to_be_scaled)

In [9]:
new_data=pd.DataFrame(scaled_inputs,columns=['Age','Gender', 'ParentalEducation', 'StudyTimeWeekly',
       'Absences', 'Tutoring', 'ParentalSupport', 'Extracurricular', 'Sports',
       'Music', 'Volunteering'])
new_data.head()

Unnamed: 0,Age,Gender,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering
0,0.472919,0.978492,0.253711,1.780336,-0.890822,1.522371,-0.108744,-0.788476,-0.660132,2.019544,-0.431866
1,1.362944,-1.021981,-0.746087,0.997376,-1.717694,-0.65687,-0.999551,-0.788476,-0.660132,-0.495161,-0.431866
2,-1.307132,-1.021981,1.253509,-0.984045,1.353542,-0.65687,-0.108744,-0.788476,-0.660132,-0.495161,-0.431866
3,0.472919,0.978492,1.253509,0.045445,-0.063951,-0.65687,0.782063,1.268269,-0.660132,-0.495161,-0.431866
4,0.472919,0.978492,0.253711,-0.902311,0.290422,1.522371,0.782063,-0.788476,-0.660132,-0.495161,-0.431866


In [10]:
#getting dummies for ethnicity
race=pd.get_dummies(categorical_inputs)
race=race.drop([3],axis=1)
race=race.rename(columns={0:'Caucasian',1:'AfriAmeri',2:'Asian'})
race=race.astype(int)
race.head()

Unnamed: 0,Caucasian,AfriAmeri,Asian
0,1,0,0
1,1,0,0
2,0,0,1
3,1,0,0
4,1,0,0


In [11]:
new_data=pd.concat([new_data,race],axis=1)

new_data=new_data[['Age','Gender','Caucasian' ,'AfriAmeri','Asian','ParentalEducation', 'StudyTimeWeekly',
       'Absences', 'Tutoring', 'ParentalSupport', 'Extracurricular', 'Sports',
       'Music', 'Volunteering']]

new_data.head()

Unnamed: 0,Age,Gender,Caucasian,AfriAmeri,Asian,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering
0,0.472919,0.978492,1,0,0,0.253711,1.780336,-0.890822,1.522371,-0.108744,-0.788476,-0.660132,2.019544,-0.431866
1,1.362944,-1.021981,1,0,0,-0.746087,0.997376,-1.717694,-0.65687,-0.999551,-0.788476,-0.660132,-0.495161,-0.431866
2,-1.307132,-1.021981,0,0,1,1.253509,-0.984045,1.353542,-0.65687,-0.108744,-0.788476,-0.660132,-0.495161,-0.431866
3,0.472919,0.978492,1,0,0,1.253509,0.045445,-0.063951,-0.65687,0.782063,1.268269,-0.660132,-0.495161,-0.431866
4,0.472919,0.978492,1,0,0,0.253711,-0.902311,0.290422,1.522371,0.782063,-0.788476,-0.660132,-0.495161,-0.431866


In [12]:
new_data.shape

(2392, 14)

### Splitting data into training, testing data

In [13]:
x_train,x_test,y_train,y_test=train_test_split(new_data,targets,test_size=0.2,shuffle=True,random_state=365)

## Regression

In [14]:
reg=LogisticRegression()

In [15]:
reg.fit(x_train,y_train)

In [16]:
reg.score(x_train,y_train)*100

94.8771562990068

In [17]:
predicted=reg.predict(x_train)      #obtaining the confusion matrix.
actual=targets
true_pos=0
false_pos=0
true_neg=0
false_neg=0
for i in range(1913):
    if predicted[i]==0 and actual[i]==1:
        false_pos+=1
    if predicted[i]==1 and actual[i]==0:
        false_neg+=1
    if predicted[i]==0 and actual[i]==0:
        true_neg+=1
    if predicted[i]==1 and actual[i]==1:
        true_pos+=1
pred_table=pd.DataFrame(data=[[true_neg,false_pos],[false_neg,true_pos]],columns=['0','1'],index=['0','1'])
pred_table

Unnamed: 0,0,1
0,954,409
1,394,156


### Testing

In [18]:
reg.score(x_test,y_test)*100

94.36325678496868

### Saving as npz for NN

In [19]:
new_data.head()

Unnamed: 0,Age,Gender,Caucasian,AfriAmeri,Asian,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering
0,0.472919,0.978492,1,0,0,0.253711,1.780336,-0.890822,1.522371,-0.108744,-0.788476,-0.660132,2.019544,-0.431866
1,1.362944,-1.021981,1,0,0,-0.746087,0.997376,-1.717694,-0.65687,-0.999551,-0.788476,-0.660132,-0.495161,-0.431866
2,-1.307132,-1.021981,0,0,1,1.253509,-0.984045,1.353542,-0.65687,-0.108744,-0.788476,-0.660132,-0.495161,-0.431866
3,0.472919,0.978492,1,0,0,1.253509,0.045445,-0.063951,-0.65687,0.782063,1.268269,-0.660132,-0.495161,-0.431866
4,0.472919,0.978492,1,0,0,0.253711,-0.902311,0.290422,1.522371,0.782063,-0.788476,-0.660132,-0.495161,-0.431866


In [20]:
targets

0       1
1       1
2       0
3       0
4       0
       ..
2387    1
2388    1
2389    0
2390    0
2391    0
Name: admitted, Length: 2392, dtype: int64

In [21]:
np.sum(targets)

706

In [22]:
len(targets)

2392

### Balancing the data

In [23]:
new_data['admitted']=targets
new_data.head()

Unnamed: 0,Age,Gender,Caucasian,AfriAmeri,Asian,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,admitted
0,0.472919,0.978492,1,0,0,0.253711,1.780336,-0.890822,1.522371,-0.108744,-0.788476,-0.660132,2.019544,-0.431866,1
1,1.362944,-1.021981,1,0,0,-0.746087,0.997376,-1.717694,-0.65687,-0.999551,-0.788476,-0.660132,-0.495161,-0.431866,1
2,-1.307132,-1.021981,0,0,1,1.253509,-0.984045,1.353542,-0.65687,-0.108744,-0.788476,-0.660132,-0.495161,-0.431866,0
3,0.472919,0.978492,1,0,0,1.253509,0.045445,-0.063951,-0.65687,0.782063,1.268269,-0.660132,-0.495161,-0.431866,0
4,0.472919,0.978492,1,0,0,0.253711,-0.902311,0.290422,1.522371,0.782063,-0.788476,-0.660132,-0.495161,-0.431866,0


In [24]:
count=0
indices_to_delete=[]
for i in range(new_data.shape[0]):
    if(new_data['admitted'][i]==0):
        count+=1
    if(count>710 and new_data['admitted'][i]==0):
        indices_to_delete.append(i)

len(indices_to_delete)

976

In [25]:
new_balanced_data = np.delete(new_data, indices_to_delete, axis=0)
final_data=pd.DataFrame(new_balanced_data)
final_data.columns=['Age', 'Gender', 'Caucasian', 'AfriAmeri', 'Asian',
       'ParentalEducation', 'StudyTimeWeekly', 'Absences', 'Tutoring',
       'ParentalSupport', 'Extracurricular', 'Sports', 'Music',
       'Volunteering', 'admitted']
final_data.head()

Unnamed: 0,Age,Gender,Caucasian,AfriAmeri,Asian,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,admitted
0,0.472919,0.978492,1.0,0.0,0.0,0.253711,1.780336,-0.890822,1.522371,-0.108744,-0.788476,-0.660132,2.019544,-0.431866,1.0
1,1.362944,-1.021981,1.0,0.0,0.0,-0.746087,0.997376,-1.717694,-0.65687,-0.999551,-0.788476,-0.660132,-0.495161,-0.431866,1.0
2,-1.307132,-1.021981,0.0,0.0,1.0,1.253509,-0.984045,1.353542,-0.65687,-0.108744,-0.788476,-0.660132,-0.495161,-0.431866,0.0
3,0.472919,0.978492,1.0,0.0,0.0,1.253509,0.045445,-0.063951,-0.65687,0.782063,1.268269,-0.660132,-0.495161,-0.431866,0.0
4,0.472919,0.978492,1.0,0.0,0.0,0.253711,-0.902311,0.290422,1.522371,0.782063,-0.788476,-0.660132,-0.495161,-0.431866,0.0


In [26]:
final_inputs=final_data.iloc[:,0:-1]
final_targets=final_data.iloc[:,-1:]

### Lets try the regression after balancing and check for improvements.

In [27]:
reg=LogisticRegression()
reg.fit(final_inputs,final_targets)

  y = column_or_1d(y, warn=True)


In [28]:
reg.score(final_inputs,final_targets)

0.9413841807909604

In [29]:
reg.score(x_test,y_test)

0.9436325678496869

### Not much improvement. Now we will try NN.

In [30]:
train = int(0.8 * final_inputs.shape[0])
test = int(0.1 * final_inputs.shape[0])

In [31]:
train_inputs=final_inputs.iloc[0:train,:]
train_targets=final_targets.iloc[0:train,:]

In [32]:
val_inputs=final_inputs.iloc[train:train+test,:]
val_targets=final_targets.iloc[train:train+test,:]
val_inputs

Unnamed: 0,Age,Gender,Caucasian,AfriAmeri,Asian,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering
1132,1.362944,0.978492,1.0,0.0,0.0,-1.745885,-1.695312,-1.599569,1.522371,-0.999551,1.268269,1.514848,-0.495161,-0.431866
1133,-1.307132,-1.021981,1.0,0.0,0.0,0.253711,-0.303311,-0.890822,-0.656870,-0.108744,-0.788476,1.514848,-0.495161,-0.431866
1134,-1.307132,-1.021981,1.0,0.0,0.0,0.253711,0.191952,-1.717694,-0.656870,0.782063,1.268269,-0.660132,2.019544,-0.431866
1135,-1.307132,-1.021981,0.0,1.0,0.0,1.253509,-0.498371,-0.300200,-0.656870,0.782063,1.268269,1.514848,-0.495161,-0.431866
1136,0.472919,-1.021981,0.0,0.0,1.0,0.253711,1.318682,-0.182076,-0.656870,-0.108744,1.268269,1.514848,-0.495161,-0.431866
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1268,0.472919,-1.021981,0.0,1.0,0.0,0.253711,0.930995,-1.599569,-0.656870,-0.999551,-0.788476,-0.660132,-0.495161,-0.431866
1269,-0.417106,0.978492,1.0,0.0,0.0,-0.746087,0.892422,-1.127071,1.522371,0.782063,-0.788476,-0.660132,-0.495161,2.315535
1270,1.362944,0.978492,0.0,1.0,0.0,0.253711,1.046033,-1.599569,1.522371,1.672869,-0.788476,1.514848,-0.495161,-0.431866
1271,-1.307132,-1.021981,0.0,0.0,0.0,-1.745885,0.972064,-0.890822,-0.656870,-0.999551,-0.788476,-0.660132,-0.495161,-0.431866


In [33]:
test_inputs=final_inputs.iloc[train+test:,:]
test_targets=final_targets.iloc[train+test:,:]

In [34]:
np.savez('Student_data_train', inputs=train_inputs, targets=train_targets)
np.savez('Student_data_validation', inputs=val_inputs, targets=val_targets)
np.savez('Student_data_test', inputs=test_inputs, targets=test_targets)