In [1]:
#import pandas and numpy
import pandas as pd
import numpy as np

In [2]:
#Load the admissions data from 'Admission_Predict.csv', using the 'Serial No.' column
#as the row index instead of a regular column, then preview the first 5 rows
df = pd.read_csv('Admission_Predict.csv', index_col = 'Serial No.')
df.head()

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2.0,3.0,8.21,0,0.65


In [3]:
#List the column labels. Note in the output that 'LOR ' and 'Chance of Admit ' end with a
#trailing space, so later cells have to spell them exactly that way when selecting columns
df.columns

Index(['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR ', 'CGPA',
       'Research', 'Chance of Admit '],
      dtype='object')

In [4]:
#Display summary statistics (count, mean, std, min, quartiles, max) for each column
df.describe()

Unnamed: 0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,316.8075,107.41,3.0875,3.4,3.4525,8.598925,0.5475,0.72435
std,11.473646,6.069514,1.143728,1.006869,0.898478,0.596317,0.498362,0.142609
min,290.0,92.0,1.0,1.0,1.0,6.8,0.0,0.34
25%,308.0,103.0,2.0,2.5,3.0,8.17,0.0,0.64
50%,317.0,107.0,3.0,3.5,3.5,8.61,1.0,0.73
75%,325.0,112.0,4.0,4.0,4.0,9.0625,1.0,0.83
max,340.0,120.0,5.0,5.0,5.0,9.92,1.0,0.97


# Making Predictions using two techniques
## First technique is a classification model

In [5]:
#Add a binary 'admitted' column: 1 when 'Chance of Admit ' is strictly greater than 0.65, otherwise 0.
#The comparison is vectorised over the whole column instead of looping over it row by row.
#A value of exactly 0.65 (or a missing value) fails the '>' test and is labelled 0.
df['admitted'] = (df['Chance of Admit '] > 0.65).astype('int64')

In [6]:
#Preview the first 5 rows again to confirm the new 'admitted' column is present
df.head()

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit,admitted
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92,1
2,324,107,4,4.0,4.5,8.87,1,0.76,1
3,316,104,3,3.0,3.5,8.0,1,0.72,1
4,322,110,3,3.5,2.5,8.67,1,0.8,1
5,314,103,2,2.0,3.0,8.21,0,0.65,0


In [7]:
#Share of students labelled as admitted (admitted == 1), as a percentage of all rows
(len(df.loc[df['admitted'].eq(1)]) / len(df)) * 100

69.5

## From here on, we will be working with the new dataset, which has a categorical column as the dependent variable.

In [8]:
#Build 'df2' for the classification task: the seven feature columns plus the binary
#'admitted' label. 'Chance of Admit ' is left out because it is only needed for regression.
df2 = df.loc[:, [
    'GRE Score',
    'TOEFL Score',
    'University Rating',
    'SOP',
    'LOR ',
    'CGPA',
    'Research',
    'admitted',
]]

In [9]:
#Separate the dependent variable 'y' (the 'admitted' label) from the independent
#variables 'X' (every remaining column of df2, i.e. the seven feature columns)
y = df2['admitted']
X = df2.drop(columns=['admitted'])

In [10]:
#Import the helper that splits a dataset into two parts, one for training and one for testing.
#25% of the rows are held out for testing; random_state is fixed so the split is repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    shuffle=True,
    random_state=42,
)

In [11]:
#Import the logistic regression classifier and fit it on the training split only.
#The fitted estimator's repr is shown as the cell output because fit() is the last expression.
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(solver='lbfgs', max_iter=1000)
clf.fit(X_train, y_train)

LogisticRegression(max_iter=1000)

In [12]:
#Display the classifier's score on the training split, i.e. on the same rows it was fitted on,
#so this number does not measure performance on unseen data
clf.score(X_train, y_train)

0.8633333333333333

In [13]:
#Predict the 'admitted' class for every row of the held-out test features
pred = clf.predict(X_test)

In [14]:
#Put the predicted and the actual classes side by side in one dataframe
#(as the output shows, the rows are labelled by the 'Serial No.' index of y_test)
pred_d = dict(predictions=pred, actual=y_test)
pred_df = pd.DataFrame(data=pred_d)
pred_df

Unnamed: 0_level_0,predictions,actual
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1
210,1,1
281,1,1
34,1,1
211,1,1
94,0,0
...,...,...
315,0,1
374,1,1
381,1,1
240,0,0


In [15]:
#Add a 'correct_pred' column that tells whether the prediction was correct:
#1 if the model's prediction matches the actual class, 0 if it does not
pred_df['correct_pred'] = np.where(
    pred_df['predictions'].eq(pred_df['actual']),
    1,
    0,
)

In [16]:
#Display the dataframe again, now including the 'correct_pred' column
pred_df

Unnamed: 0_level_0,predictions,actual,correct_pred
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
210,1,1,1
281,1,1,1
34,1,1,1
211,1,1,1
94,0,0,1
...,...,...,...
315,0,1,0
374,1,1,1
381,1,1,1
240,0,0,1


In [17]:
#Import the accuracy metric and store the classifier's accuracy on the test set,
#expressed as a percentage, in 'clf_pred'
from sklearn.metrics import accuracy_score
clf_pred = accuracy_score(y_true=y_test, y_pred=pred) * 100

## Second technique is a regression model
### We go back to using the original 'df' dataframe

In [18]:
#Preview 'df' again; it still holds 'Chance of Admit ' and also the 'admitted' column added earlier
df.head()

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit,admitted
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92,1
2,324,107,4,4.0,4.5,8.87,1,0.76,1
3,316,104,3,3.0,3.5,8.0,1,0.72,1
4,322,110,3,3.5,2.5,8.67,1,0.8,1
5,314,103,2,2.0,3.0,8.21,0,0.65,0


In [19]:
#Regression inputs: 'X2' holds the seven feature columns and 'y2' the continuous
#'Chance of Admit ' target. The 'admitted' column is left out because it belongs to the
#classification technique. The data is then split into training and testing sets with the
#same test_size and random_state as the classification split.
X2 = df.loc[:, [
    'GRE Score',
    'TOEFL Score',
    'University Rating',
    'SOP',
    'LOR ',
    'CGPA',
    'Research',
]]
y2 = df['Chance of Admit ']
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2,
    y2,
    test_size=0.25,
    shuffle=True,
    random_state=42,
)

In [20]:
#Import scikit-learn's linear_model module and fit a linear regression on the training split only.
#The fitted estimator's repr is shown as the cell output because fit() is the last expression.
from sklearn import linear_model
reg = linear_model.LinearRegression()
reg.fit(X2_train, y2_train)

LinearRegression()

In [21]:
#Display the regression model's score on the training split, i.e. on the same rows it was
#fitted on (for LinearRegression, score() is the R^2 coefficient of determination),
#so this number does not measure performance on unseen data
reg.score(X2_train, y2_train)

0.7958850329341665

In [22]:
#Predict a 'Chance of Admit ' value for every row of the held-out test features
predreg = reg.predict(X2_test)

In [23]:
#Put the predicted and the actual 'Chance of Admit ' values side by side in one dataframe
#(as the output shows, the rows are labelled by the 'Serial No.' index of y2_test)
pred_d2 = dict(predictions=predreg, actual=y2_test)
pred_df2 = pd.DataFrame(data=pred_d2)
pred_df2

Unnamed: 0_level_0,predictions,actual
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1
210,0.649161,0.68
281,0.723192,0.68
34,0.935349,0.90
211,0.821617,0.79
94,0.580105,0.44
...,...,...
315,0.633875,0.66
374,0.735395,0.79
381,0.773202,0.78
240,0.541942,0.59


In [24]:
#Percentage of test-set students whose actual chance of admission is above 0.65
actual_admission = (
    len(pred_df2.loc[pred_df2['actual'].gt(0.65)]) / len(pred_df2)
) * 100

In [25]:
#Percentage of test-set students whose predicted chance of admission is above 0.65
predicted_admission = (
    len(pred_df2.loc[pred_df2['predictions'].gt(0.65)]) / len(pred_df2)
) * 100

In [26]:
#Display the two stored percentages.
#print() already converts its arguments to text, so the redundant format(str(...)) wrappers
#were dropped; the printed lines are unchanged.
#NOTE(review): the two numbers measure different things. clf_pred is the classifier's accuracy
#on the test set, while predicted_admission is the share of test rows whose regression output
#exceeds 0.65. They should not be read as competing accuracy figures.
print('The classification prediction percetage is', clf_pred, '%')
print('The regression prediction percetage is', predicted_admission, '%')

The classification prediction percetage is 85.0 %
The regression prediction percetage is 71.0 %


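## Recommends using a classification algorithm. The model still needs further hyperparameter tuning and data preprocessing.

Note that the two percentages printed above are not directly comparable: the classification figure is the accuracy on the test set, whereas the regression figure is the share of test students whose predicted chance of admission is above 0.65.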