In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Please do upvote if you have learned something from this notebook :)**

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the admissions CSV from the Kaggle input directory and preview the first rows.
admissions_csv = '/kaggle/input/graduate-admissions/Admission_Predict.csv'
data = pd.read_csv(admissions_csv)
data.head()

In [None]:
# Drop the serial-number column (presumably just a row identifier, not a feature).
# errors='ignore' keeps this cell safe to re-run: once the column is gone a
# second run is a no-op instead of raising KeyError.
data = data.drop(columns = 'Serial No.', errors = 'ignore')
data.head()

In [None]:
# Summarize the dataframe: per-column non-null counts and dtypes, plus memory usage.
data.info()

In [None]:
# Count the missing values in each column (the author notes the dataset has none).
bool_data = data.isnull()
for col, n_missing in bool_data.sum().items():
    print(col + ': ' + str(n_missing))

# **GRE Score vs Chance of Admit**

In [None]:
# Scatter plot with a fitted regression line: chance of admit (x) against GRE score (y).
sns.regplot(data = data, x = 'Chance of Admit ', y = 'GRE Score')

We can clearly see that candidates with high GRE scores have a good chance of getting admitted to universities with high ratings. However, some candidates with high scores still have a lower chance of admission, since universities look at many other things in a candidate apart from the GRE score. Still, a good GRE score has helped the majority of them.

In [None]:
# Scatter plot with a fitted regression line: chance of admit (x) against TOEFL score (y).
sns.regplot(data = data, x = 'Chance of Admit ', y = 'TOEFL Score')

The same holds for TOEFL scores as for GRE scores! Hence, candidates from countries that do not have English as their national language should have a decent GRE, TOEFL, or IELTS score to get into top universities worldwide.

# **Do SOP and LOR even matter?**

In [None]:
# SOP rating vs. chance of admit: one box per SOP rating.
# x and y are passed by keyword; recent seaborn releases no longer accept them
# positionally and raise a TypeError.
sns.boxplot(x = 'SOP', y = 'Chance of Admit ', data = data)

In [None]:
# LOR rating vs. chance of admit: one box per LOR rating.
# x and y are passed by keyword; recent seaborn releases no longer accept them
# positionally and raise a TypeError. Note the trailing space in both column names.
sns.boxplot(x = 'LOR ', y = 'Chance of Admit ', data = data)

Yes! SOP -> Statement of Purpose: here you describe why you want to study in that country and why you selected that university in particular. It basically shows how serious and motivated you are with respect to your career. LOR -> Letter of Recommendation: a document designed to add extra weight and merit to a job or college application. These letters are usually written by a supervisor, colleague, teacher, or friend.

So there is no doubt that these two features will have a significant impact on the outcome.

# **CGPA vs Chance of Admit**

In [None]:
# Scatter plot with a fitted regression line: chance of admit (x) against CGPA (y).
sns.regplot(data = data, x = 'Chance of Admit ', y = 'CGPA')

In [None]:
# Distribution of chance of admit for each value of the Research column.
sns.swarmplot(data = data, x = 'Research', y = 'Chance of Admit ')

It can be clearly seen that candidates with research experience have a slight edge over those who do not.

# **Feature Correlation**

In [None]:
# Pairwise correlation between all columns, drawn as an annotated heatmap.
corr_data = data.corr()
# Create exactly one 8x8 figure for the heatmap to draw on; an additional bare
# plt.figure() call would leave an extra, empty figure in the cell output.
figure = plt.figure(figsize = (8, 8))
sns.heatmap(corr_data, cmap = 'YlGnBu', annot = True)

In [None]:
# Correlation of every column with the target, as a {column: correlation} mapping.
target_corr = corr_data['Chance of Admit ']
dict_corr = dict(target_corr)

# Ascending by correlation value (column name breaks ties).
sorted_list = sorted(dict_corr.items(), key = lambda kv: (kv[1], kv[0]))

# Print from the highest correlation down to the lowest. The last entry of the
# ascending list is skipped: that slot is expected to hold the target's
# correlation with itself.
for feature, value in reversed(sorted_list[:-1]):
    print(feature + ' --> ' + str(value))

CGPA has the most impact on whether or not a candidate gets selected, so maintaining a good GPA is necessary.

In [None]:
#linear regression, rmse, train_test_split, MinMax Scalar.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor 

# **Linear Regression**

In [None]:
import numpy as np

# Split the data: target column vs. feature matrix (every column except the target).
values = data['Chance of Admit ']
train_data = data.drop('Chance of Admit ', axis = 1)

# 75/25 train/test split. A fixed random_state makes the split, and therefore
# the reported RMSE, reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(train_data, values, test_size = 0.25, random_state = 42)
print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)

# Scale the features to [0, 1]. The scaler is fitted on the training split only
# and the same min/max are then applied to the test split; re-fitting on the
# test data would scale the two splits differently and leak test-set
# statistics into the evaluation.
scaler = MinMaxScaler(feature_range = (0, 1))
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fit ordinary least squares on the scaled training features.
model = LinearRegression()
model.fit(X_train, Y_train)

# Evaluate on the held-out split.
Y_pred = model.predict(X_test)

print('Root Mean Squared Error:', np.sqrt(mean_squared_error(Y_test, Y_pred)))

In [None]:
# Put predicted and actual target values side by side so they can be plotted together.
results_df = pd.DataFrame(dict(Predicted = Y_pred.tolist(), Actual = Y_test.tolist()))

# Actual (x) vs. predicted (y) values on the test split.
sns.scatterplot(data = results_df, x = 'Actual', y = 'Predicted')

In [None]:
# Correlation matrix of predicted vs. actual values. Note this is a correlation
# coefficient, not the slope of the regression line; the author reports about
# 0.91, where 1.0 would mean a perfect linear relationship.
corr_df = results_df.corr()
print(corr_df)

# Actual (x) vs. predicted (y) with a fitted regression line.
sns.regplot(data = results_df, x = 'Actual', y = 'Predicted')

# **Random Forest Regression**

In [None]:
# Target column and feature matrix (every column except the target).
# The target is bound to `values` here, so this cell does not depend on a
# variable left over from an earlier cell.
values = data['Chance of Admit ']
train_data = data.drop('Chance of Admit ', axis = 1)

# 75/25 train/test split. A fixed random_state makes the split, and therefore
# the reported RMSE, reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(train_data, values, test_size = 0.25, random_state = 42)
print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)

# Random forest of 250 trees with a fixed seed; the features are used unscaled.
forest = RandomForestRegressor(n_estimators = 250, random_state = 20)

forest.fit(X_train, Y_train)

# Evaluate on the held-out split.
Y_pred = forest.predict(X_test)

print('Root Mean Squared Error:', np.sqrt(mean_squared_error(Y_test, Y_pred)))

In [None]:
# Put predicted and actual target values side by side so they can be plotted together.
results_df = pd.DataFrame(dict(Predicted = Y_pred.tolist(), Actual = Y_test.tolist()))

# Actual (x) vs. predicted (y) values on the test split.
sns.scatterplot(data = results_df, x = 'Actual', y = 'Predicted')

In [None]:
# Correlation matrix of predicted vs. actual values (1.0 would mean a perfect
# linear relationship between them).
corr_df = results_df.corr()
print(corr_df)

# Actual (x) vs. predicted (y) with a fitted regression line.
sns.regplot(data = results_df, x = 'Actual', y = 'Predicted')

# **Summary**
**Linear regression -> RMSE 0.069**

**Random Forest for Regression -> RMSE 0.066**

Random Forest with 250 decision trees has made better predictions than a simple Linear Regression model.