<H1> An algorithm for predicting graduate admission </H1>

Used dataset: https://www.kaggle.com/mohansacharya/graduate-admissions

In [None]:
# Import popular modules
import numpy as np
import pandas as pd

# Seed NumPy's global random generator with a fixed value.
np.random.seed(42)

# Data visualization
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
# Font sizes applied to axis labels and tick labels of every figure.
plt.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})
import seaborn as sns
from pandas.plotting import scatter_matrix

# Data preparation
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import train_test_split

# Model selection 
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Read the admissions CSV (path is relative to the working directory).
csv_name = "Graduate_Admission.csv"
df = pd.read_csv(filepath_or_buffer=csv_name)

In [None]:
# Preview the first five rows of the raw table.
df.head(n=5)

In [None]:
# (rows, columns) of the table as loaded.
df.shape

In [None]:
# Replace the spaces in the score/rating column names with underscores and
# drop the trailing space from the target column's name.
column_renames = {
    'GRE Score': 'GRE_Score',
    'TOEFL Score': 'TOEFL_Score',
    'University Rating': 'University_Rating',
    'Chance of Admit ': 'Chance of Admit',
}
df.rename(columns=column_renames, inplace=True)

# Rescale the target by a factor of 100.
# NOTE: re-running this cell multiplies the column by 100 again.
df['Chance of Admit'] *= 100.00

In [None]:
# Print column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics per column ('Serial No.' is still a column at this point).
df.describe()

In [None]:
# Take the serial-number column out of the frame and use its values as the
# row index, so it is no longer among the columns used further down.
serial_numbers = df.pop('Serial No.')
df.index = serial_numbers.values

In [None]:
# Summary statistics again, now without the 'Serial No.' column.
df.describe()

In [None]:
# One histogram per column, 50 bins each.
df.hist(bins = 50, figsize = (20,15))

<H1> Correlation </H1>  

In [None]:
# Pairwise Pearson correlation between all columns (pandas' default method).
corr_matrix = df.corr(method='pearson')

In [None]:
# Rank every column by its correlation with the target, strongest first.
target_corr = corr_matrix['Chance of Admit']
target_corr.sort_values(ascending=False)

In [None]:
# Heatmap of the correlation matrix with the coefficient written in each cell.
sns.heatmap(data=corr_matrix, annot=True)

From the heatmap above we can see that the strongest correlations are between:
+ <b> TOEFL_Score </b> and <b> GRE_Score </b>    
+ <b> CGPA </b> and <b> GRE_Score </b>  

There are no negatively correlated attributes.

In [None]:
# Columns to compare pairwise. Note that 'LOR ' keeps the trailing space of
# the original CSV header (it is not part of the rename earlier on).
attribs = ['GRE_Score', 'TOEFL_Score', 'University_Rating', 'SOP', 'LOR ', 'CGPA']
scatter_matrix(df.loc[:, attribs], figsize=(20, 15))

# Alternative view of the same relationships: seaborn pair plot over all
# columns, coloured by the 'Research' column.
sns.pairplot(data=df, hue='Research')

<H1> Data Preparation </H1>

In [None]:
# Scaler that standardizes each column (removes the mean, scales to unit variance).
std_scaler = StandardScaler()

In [None]:
# NOTE(review): the scaler is fitted on the whole frame, i.e. on the target
# column as well as the features, and on rows that are later held out as the
# test set. Consider splitting first and fitting on the training features only.
std_scaler.fit(df)

In [None]:
# transform() returns a plain NumPy array, so column labels are lost here.
df_scaled = std_scaler.transform(df)

In [None]:
# Wrap the scaled array back into a DataFrame. The original index is passed
# along with the column names so that rows keep their serial-number labels
# instead of being silently renumbered from 0.
df_scaled = pd.DataFrame(df_scaled, index=df.index, columns=df.columns)

In [None]:
# Inspect the first 20 standardized rows.
df_scaled.head(20)

<H1> Split set into train and test </H1>

In [None]:
# The last column is the target; every column before it is a feature.
target_column = df_scaled.columns[-1]
X = df_scaled.drop(columns=target_column)
y = df_scaled[target_column]

In [None]:
# Hold out 15% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state=42)

In [None]:
# Report the (rows, columns) shape of both feature sets, one per line.
print(
    f"Training data size: {X_train.shape}",
    f"Testing data size: {X_test.shape}",
    sep="\n",
)

<H1> Model - Linear Regression </H1>

In [None]:
# Ordinary least-squares linear regression with default settings.
lin_reg = LinearRegression()

In [None]:
# Fit on the standardized training features and standardized target.
lin_reg.fit(X_train, y_train)

In [None]:
# Predictions for the held-out rows (in standardized target units, since y was scaled).
lin_pred = lin_reg.predict(X_test)

In [None]:
# Mean squared error on the held-out rows. Both y_test and the predictions are
# in standardized target units, so this is not an error in percentage points.
mse = metrics.mean_squared_error(y_true=y_test, y_pred=lin_pred)

In [None]:
# Display the linear-regression test MSE.
mse

In [None]:
# Raw (unscaled) feature values for a single applicant, in the column order of X.
# NOTE(review): if X's columns start with GRE_Score, TOEFL_Score, the first two
# entries (100, 5) are assigned to those scores — verify these are the intended values.
user_input = [[100, 5, 5, 5, 5, 9.65, 1]] #user input from testing data

# lin_reg was trained on standardized data, so the raw input must go through
# the same scaling before prediction. std_scaler was fitted on features AND
# target (target is the last column), hence the [:-1] / [-1] split of its
# fitted statistics.
feature_mean, feature_scale = std_scaler.mean_[:-1], std_scaler.scale_[:-1]
target_mean, target_scale = std_scaler.mean_[-1], std_scaler.scale_[-1]
user_input_scaled = pd.DataFrame(
    (np.asarray(user_input, dtype=float) - feature_mean) / feature_scale,
    columns=X_train.columns,  # same feature names the model was fitted with
)

# Undo the target scaling so the result is reported on the original
# 'Chance of Admit' scale rather than in standard deviations.
user_pred = lin_reg.predict(user_input_scaled) * target_scale + target_mean
print((user_pred.flatten()).astype(float))

<H1> Model - Random Forest </H1>

In [None]:
# Base random-forest estimator handed to the grid search; random_state is fixed.
for_regr = RandomForestRegressor(random_state=0)

In [None]:
# Hyperparameter candidates to evaluate exhaustively (2 x 2 x 3 = 12 combinations).
param_grid = dict(
    n_estimators=[200, 500],
    max_features=['sqrt', 'log2'],
    max_depth=[5, 6, 7],
)

# 5-fold cross-validated grid search over the forest, scored by R^2.
# n_jobs=-1 runs the candidate fits in parallel on all available processors.
grid_search = GridSearchCV(
    for_regr,
    param_grid,
    scoring="r2",
    cv=5,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Run the cross-validated search on the training split only.
grid_search.fit(X_train, y_train)

<b> Best set of hyperparameters </b>

In [None]:
from prettytable import PrettyTable

# Tabulate the winning hyperparameter combination, one row per parameter.
table = PrettyTable(["Parameter", "Value"])
for name, value in grid_search.best_params_.items():
    table.add_row([name, value])

print(table)

In [None]:
# Retrain the model with the best hyperparameters found by the grid search.
# random_state=0 matches the estimator that was tuned, so the refit forest is
# the configuration that was actually selected and is repeatable on re-run.
# Unpacking best_params_ keeps this cell in sync with whatever param_grid
# contains. (GridSearchCV refits by default, so grid_search.best_estimator_
# holds an equivalent, already-fitted model.)
for_regr = RandomForestRegressor(random_state=0, **grid_search.best_params_)

In [None]:
# Fit the tuned forest on the training split.
for_regr.fit(X_train, y_train)

In [None]:
# Forest predictions for the held-out rows (standardized target units).
for_regr_prediction = for_regr.predict(X_test)

In [None]:
# Test MSE of the random forest, in the same standardized units as the
# linear-regression MSE. The variable name's "fro_" spelling is kept because
# the following cell reads it under this name.
fro_regr_mse = metrics.mean_squared_error(y_true=y_test, y_pred=for_regr_prediction)

In [None]:
# Display the random-forest test MSE.
fro_regr_mse