# ***UniSight***

#####

### **Importing Libraries**

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

### **Loading Dataset**

In [3]:
df = pd.read_csv("dataset.csv")

In [4]:
df.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


### **Data Exploration**

In [5]:
df.columns.tolist()

['Serial No.',
 'GRE Score',
 'TOEFL Score',
 'University Rating',
 'SOP',
 'LOR ',
 'CGPA',
 'Research',
 'Chance of Admit ']

In [6]:
df.shape

(400, 9)

In [7]:
df.drop("Serial No.", axis=1, inplace=True)

In [8]:
df.head

<bound method NDFrame.head of      GRE Score  TOEFL Score  University Rating  SOP  LOR   CGPA  Research  \
0          337          118                  4  4.5   4.5  9.65         1   
1          324          107                  4  4.0   4.5  8.87         1   
2          316          104                  3  3.0   3.5  8.00         1   
3          322          110                  3  3.5   2.5  8.67         1   
4          314          103                  2  2.0   3.0  8.21         0   
..         ...          ...                ...  ...   ...   ...       ...   
395        324          110                  3  3.5   3.5  9.04         1   
396        325          107                  3  3.0   3.5  9.11         1   
397        330          116                  4  5.0   4.5  9.45         1   
398        312          103                  3  3.5   4.0  8.78         0   
399        333          117                  4  5.0   4.0  9.66         1   

     Chance of Admit   
0                0.92

In [9]:
df = df.rename(columns={"GRE Score": "GRE", "TOEFL Score": "TOEFL", "University Rating": "Rating", "Chance of Admit ": "Chance"})

In [10]:
df.head()

Unnamed: 0,GRE,TOEFL,Rating,SOP,LOR,CGPA,Research,Chance
0,337,118,4,4.5,4.5,9.65,1,0.92
1,324,107,4,4.0,4.5,8.87,1,0.76
2,316,104,3,3.0,3.5,8.0,1,0.72
3,322,110,3,3.5,2.5,8.67,1,0.8
4,314,103,2,2.0,3.0,8.21,0,0.65


In [11]:
df.dtypes

GRE           int64
TOEFL         int64
Rating        int64
SOP         float64
LOR         float64
CGPA        float64
Research      int64
Chance      float64
dtype: object

### **Handling NULL Values**

In [12]:
df.isnull().sum()

GRE         0
TOEFL       0
Rating      0
SOP         0
LOR         0
CGPA        0
Research    0
Chance      0
dtype: int64

In [13]:
df.rename(columns={"Rating": "Rank"})

Unnamed: 0,GRE,TOEFL,Rank,SOP,LOR,CGPA,Research,Chance
0,337,118,4,4.5,4.5,9.65,1,0.92
1,324,107,4,4.0,4.5,8.87,1,0.76
2,316,104,3,3.0,3.5,8.00,1,0.72
3,322,110,3,3.5,2.5,8.67,1,0.80
4,314,103,2,2.0,3.0,8.21,0,0.65
...,...,...,...,...,...,...,...,...
395,324,110,3,3.5,3.5,9.04,1,0.82
396,325,107,3,3.0,3.5,9.11,1,0.84
397,330,116,4,5.0,4.5,9.45,1,0.91
398,312,103,3,3.5,4.0,8.78,0,0.67


In [14]:
df.to_csv("cleaned_dataset.csv", index=False)

### **Preparing Data**

In [12]:
X = df.drop("Chance", axis=1)
y = df["Chance"]

In [14]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, random_state = 42
)

### **Linear Regression**

In [15]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [16]:
lr_model = LinearRegression()
lr_model.fit(X_train_scaled, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [17]:
y_pred = lr_model.predict(X_test_scaled)

In [18]:
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

In [19]:
print(f"MAE: {mae:.4f}")
print(f"RMSE {rmse:.4f}")
print(f"R2: {r2:.4f}")

MAE: 0.0480
RMSE 0.0679
R2: 0.8212


### **Random Forest Regressor**

In [20]:
rf_model = RandomForestRegressor(
    n_estimators=200,
    max_depth=None,
    random_state=42
)
rf_model.fit(X_train, y_train)

0,1,2
,n_estimators,200
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [21]:
y_pred = rf_model.predict(X_test)

In [22]:
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

In [23]:
print(f"MAE: {mae:.4f}")
print(f"RMSE {rmse:.4f}")
print(f"R2: {r2:.4f}")

MAE: 0.0500
RMSE 0.0706
R2: 0.8070


### **XGBoost Regressor**

In [24]:
xgb_model = XGBRegressor(
    n_estimators = 300,
    learning_rate = 0.05,
    max_depth = 5,
    subsample = 0.8,
    colsample_bytree = 0.8,
    random_state = 42
)
xgb_model.fit(X_train, y_train)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [25]:
y_pred = xgb_model.predict(X_test)

In [26]:
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

In [27]:
print(f"MAE: {mae:.4f}")
print(f"RMSE {rmse:.4f}")
print(f"R2: {r2:.4f}")

MAE: 0.0503
RMSE 0.0718
R2: 0.8004


### **Comparing Best Model**

In [28]:
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=200, random_state=42),
    "XGBoost": XGBRegressor(
        n_estimators = 300,
        learning_rate = 0.05,
        max_depth = 5,
        subsample = 0.8,
        colsample_bytree = 0.8,
        random_state = 42,
        verbosity = 0
    )
}

In [29]:
results = []
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [30]:
for name, model in models.items():
    if name == "Linear Regression":
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    results.append([name, mae, rmse, r2, y_pred])

In [31]:
results_df = pd.DataFrame(results, columns=["Model", "MAE", "RMSE", "R2", "Predictions"])
print("\n Model Comparision \n")
print(results_df[["Model", "MAE", "RMSE", "R2"]])


 Model Comparision 

               Model       MAE      RMSE        R2
0  Linear Regression  0.047957  0.067949  0.821208
1      Random Forest  0.050018  0.070591  0.807031
2            XGBoost  0.050319  0.071787  0.800438


### **Prediction**

In [32]:
def predict_admission_chance(model, scaler, gre, toefl, rating, sop, lor, cgpa, research):
    research_val = 1 if research.lower() == "yes" else 0
    features = np.array([[gre, toefl, rating, sop, lor, cgpa, research_val]])
    features_scaled = scaler.transform(features)
    prediction = model.predict(features_scaled)[0]
    return max(0, min(1, prediction))

In [33]:
chance = predict_admission_chance(lr_model, scaler, gre=325, toefl=110, rating=4, sop=4.5, lor=4.0, cgpa=9.2, research="Yes")
print(f"{chance*100:.2f}%")

84.31%




### **Saving Model**

In [34]:
import joblib

In [35]:
joblib.dump(lr_model, "lr_model.joblib")
joblib.dump(scaler, "scaler.joblib")

['scaler.joblib']

### **Feature Importance**

In [36]:
features = ["GRE", "TOEFL", "Rating", "SOP", "LOR", "CGPA", "Research"]

In [37]:
importance = model.feature_importances_

In [39]:
importance_df = pd.DataFrame({
    "Feature": features,
    "Importance": importance
}).sort_values(by="Importance", ascending=False)

In [40]:
importance_df.head()

Unnamed: 0,Feature,Importance
5,CGPA,0.475839
0,GRE,0.173557
3,SOP,0.08248
1,TOEFL,0.072929
6,Research,0.072813


In [41]:
importance_df.to_csv("importance_df.csv")