In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree lazily and print the full path of every file found
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Let's Explore the data

In [None]:
# Read the admissions CSV into a DataFrame and preview its first rows
csv_path = '/kaggle/input/graduates-admission-prediction/admission_data.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# List the column names as loaded from the CSV.
# Some names carry a trailing space (later cells index 'Chance of Admit ' and 'LOR ').
df.columns

In [None]:
# Rename three columns so their names use underscores instead of spaces
column_renames = {
    'GRE Score': 'GRE_Score',
    'TOEFL Score': 'TOEFL_Score',
    'University Rating': 'University_Rating',
}
df = df.rename(columns=column_renames)

In [None]:
# Show the column names again to confirm the rename took effect.
df.columns

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Print column dtypes and non-null counts to spot missing values and unexpected types.
df.info()

In [None]:
# Exploring variables: distribution of the target.
# 'Chance of Admit ' is used as a regression target later in the notebook, so it is
# shown as a histogram over its whole range. A pie chart of the 20 most frequent
# exact values would drop the remaining rows and treat a numeric quantity as categories.
ax = df['Chance of Admit '].plot(kind='hist', bins=20, figsize=(6, 6))
ax.set_xlabel('Chance of Admit')
ax.set_title('Distribution of Chance of Admit')
plt.show()


In [None]:
# Checking relation between Chances of admit and CGPA.
# A scatter plot draws one point per row from both columns; kind="hist" would only
# bin the `y` column and never show the `x` values, so no relation would be visible.
ax = df.plot(x='Chance of Admit ', y='CGPA', kind="scatter", figsize=(9, 8), color='lightcoral')
ax.set_title('CGPA vs. Chance of Admit')
plt.show()

In [None]:
# Checking relation between Chances of admit and SOP.
# A scatter plot draws one point per row from both columns; kind="hist" would only
# bin the `y` column and never show the `x` values, so no relation would be visible.
ax = df.plot(x='Chance of Admit ', y='SOP', kind="scatter", figsize=(9, 8), color='olive')
ax.set_title('SOP vs. Chance of Admit')
plt.show()

The plot suggests that the lower the SOP value, the lower the Chance of Admit.

In [None]:
# Checking relation between Chances of admit and GRE Score.
# A scatter plot draws one point per row from both columns; kind="hist" would only
# bin the `y` column and never show the `x` values, so no relation would be visible.
ax = df.plot(x='Chance of Admit ', y='GRE_Score', kind="scatter", figsize=(9, 8), color='coral')
ax.set_title('GRE score vs. Chance of Admit')
plt.show()

In [None]:
# Checking relation between Chances of admit and TOEFL Score.
# A scatter plot draws one point per row from both columns; kind="hist" would only
# bin the `y` column and never show the `x` values, so no relation would be visible.
ax = df.plot(x='Chance of Admit ', y='TOEFL_Score', kind="scatter", figsize=(9, 8), color='peru')
ax.set_title('TOEFL score vs. Chance of Admit')
plt.show()

In [None]:
# Scatter plot with Chance of Admit on the x-axis and GRE score on the y-axis
df.plot.scatter(x='Chance of Admit ', y='GRE_Score', color='darkturquoise')

In [None]:
# Scatter plot with Chance of Admit on the x-axis and TOEFL score on the y-axis.
# This cell previously plotted GRE_Score a second time (an exact repeat of the cell
# before it), which left TOEFL without a scatter plot of its own.
df.plot.scatter(x='Chance of Admit ', y='TOEFL_Score', color='teal')

In [None]:
# Scatter plot with Chance of Admit on the x-axis and the Research column on the y-axis
df.plot.scatter(x='Chance of Admit ', y='Research', color='forestgreen')

Research experience seems to have a moderate to high impact on the chance of admission.

In [None]:
#fig, ax = plt.subplots()

#ax.plot(df['Chance of Admit '], df['TOEFL_Score'], color = 'goldenrod')
#ax.plot(df['Chance of Admit '], df['GRE_Score'], color = 'darkturquoise')
#plt.show()

**From the data analysis, we can say that moderate to good GRE and TOEFL scores, combined with research experience, can earn a student an admit.**

# Data Preprocessing

In [None]:
# Separate the label column from the predictor columns.
# The linear model has the form Y = Mx + b (M = weights, b = bias).
target_column = 'Chance of Admit '
y = df[target_column]
x = df.drop(columns=[target_column])

In [None]:
#Let's make data workable i.e scale the data with simple scaler
# This gives R2 score of about 0.77

#from sklearn import preprocessing
#x = preprocessing.scale(x)

In [None]:
# Rescale every feature column to the [0, 1] range with MinMaxScaler.
#
# The scaler is applied to `x` (the frame that is split and fed to the model), not to
# `df`: `x` was copied out of `df` by drop() before this cell runs, so rescaling `df`
# would never reach the model. The target `y` is left in its original units so that
# predictions can be read directly as a chance of admit.
#
# Note: ordinary least squares with an intercept produces the same predictions (and
# therefore the same R2) with or without this rescaling; what changes is the scale of
# the fitted coefficients. Run-to-run differences in R2 come from the random split.
# For scale-sensitive models, fit the scaler on the training split only to avoid
# leaking test-set statistics.
from sklearn.preprocessing import MinMaxScaler

Scale = MinMaxScaler()

# Feature columns only; the target 'Chance of Admit ' is deliberately excluded.
features = x.columns.tolist()

# fit_transform returns a plain array, so rebuild a DataFrame with the original labels.
x = pd.DataFrame(Scale.fit_transform(x[features]), columns=features, index=x.index)

# Preview a few rows instead of displaying the whole frame.
x.head()

# Train_Test_Split

In [None]:
# Split data into train and test data for both x and y, so the model is evaluated
# on rows it was not fitted on.
from sklearn.model_selection import train_test_split

# 80% train data and 20% test data.
# random_state pins the shuffle so the split, and every metric computed from it,
# is identical on each run.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
# (rows, columns) of the training feature matrix.
X_train.shape

In [None]:
# Shape of the training target; its length should equal the row count of X_train.
y_train.shape

# Linear Regression Model

In [None]:
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construction and training are chained;
# the bare name on the last line displays the fitted model as the cell output.
lr = LinearRegression().fit(X_train, y_train)
lr

In [None]:
# Predict the target for the held-out test features with the fitted model.
y_pred = lr.predict(X_test)

# Prediction Metrics and plots

In [None]:
# Coefficient of determination (R^2) of the predictions against the true test labels.
from sklearn.metrics import r2_score
r2_score(y_test, y_pred)

**An R2 score of 1.0 indicates a perfect fit, i.e. every prediction equals the actual value.**

Here we got an R2 score near 0.8, meaning the model explains roughly 80% of the variance in the chance of admit, which is still a pretty good fit.

In [None]:
# Predicted vs. actual values on the test set, with two reference lines.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred, label='test samples')

# Obtain m (slope) and b (intercept) of the least-squares line through the
# (actual, predicted) points.
m, b = np.polyfit(y_test, y_pred, 1)

# Both lines are straight, so two end points over the observed range are enough
# to draw each of them as a single clean segment.
line_x = np.array([np.min(y_test), np.max(y_test)])
ax.plot(line_x, m * line_x + b, color='red', label='best-fit line')

# Reference for a perfect model (predicted == actual); the closer the red line and
# the points are to it, the better the predictions.
ax.plot(line_x, line_x, color='gray', linestyle='--', label='ideal (y = x)')

ax.set_xlabel('Actual Chance of Admit')
ax.set_ylabel('Predicted Chance of Admit')
ax.set_title('Linear regression: predicted vs. actual')
ax.legend()
plt.show()

The points turn out to lie approximately along a straight line, so the predicted values track the actual values well.

**Thank You :)**

*Please feel free to share your feedback and suggestions in the comments :)*