In [3]:
import pandas as pd
!pip install catboost
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score



In [4]:
#Load Dataset
from google.colab import drive
drive.mount('/content/gdrive')
import pandas as pd
data=pd.read_excel(r'/content/gdrive/My Drive/8750/excel1.xlsx')

Mounted at /content/gdrive


In [5]:
# Data Cleaning and Preprocessing
# Dropping columns with more than 30% missing values
missing_values = data.isnull().sum()
missing_percentage = (missing_values / len(data)) * 100
columns_to_drop = missing_percentage[missing_percentage > 30].index
data_cleaned = data.drop(columns=columns_to_drop)

In [6]:
# Imputing missing values
numerical_columns = data_cleaned.select_dtypes(include=['number']).columns
data_cleaned[numerical_columns] = data_cleaned[numerical_columns].apply(lambda x: x.fillna(x.median()))
non_numerical_columns = data_cleaned.select_dtypes(exclude=['number']).columns
data_cleaned[non_numerical_columns] = data_cleaned[non_numerical_columns].apply(lambda x: x.fillna(x.mode()[0]))



In [7]:
# Feature Selection
selected_features = [
    'Careers service rank 2022', 'International board (%) 2022',
    'International students (%) 2022', 'Aims achieved (%) 2022',
    'Internships (%)', 'Average course length (months)'
]
X = data_cleaned[selected_features]
y = data_cleaned['Rank'] <= 10  # Target variable: True if rank is in top 10, False otherwise

In [8]:
# Data Normalization
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [9]:
# Splitting the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [10]:
# Model Building and Training
catboost_model = CatBoostClassifier(verbose=0)  # 'verbose=0' to avoid too much logging
catboost_model.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x7ab564e2e200>

In [13]:
# Convert the boolean target variable to an integer type
y = y.astype(int)

# Splitting the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Model Building and Training
catboost_model = CatBoostClassifier(verbose=0)  # 'verbose=0' to avoid too much logging
catboost_model.fit(X_train, y_train)

# Model Prediction and Evaluation
y_pred = catboost_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.92


In [15]:
# Example Prediction (Replace with real values for a new prediction)
new_data = [[15, 50, 60, 80, 90, 24]]  # Replace with real input values
new_data_scaled = scaler.transform(new_data)
predicted_probability = catboost_model.predict_proba(new_data_scaled)[0][1]
print(f'Predicted Probability of being in the top 10: {predicted_probability}')

Predicted Probability of being in the top 10: 0.1632555082416065




In [16]:
# Model Building and Training (from previous steps)
catboost_model = CatBoostClassifier(verbose=0)  # 'verbose=0' to avoid too much logging
catboost_model.fit(X_train, y_train)

# Function to take user input for features
def get_user_input():
    print("Please enter the values for the following features:")
    careers_service_rank = float(input("Careers service rank 2022: "))
    international_board = float(input("International board (%) 2022: "))
    international_students = float(input("International students (%) 2022: "))
    aims_achieved = float(input("Aims achieved (%) 2022: "))
    internships = float(input("Internships (%): "))
    course_length = float(input("Average course length (months): "))
    return [careers_service_rank, international_board, international_students, aims_achieved, internships, course_length]

# Getting user input
new_data = get_user_input()

# Normalizing and reshaping the input for prediction
new_data_scaled = scaler.transform([new_data])  # Note the input is wrapped in a list

# Predicting with the model
predicted_probability = catboost_model.predict_proba(new_data_scaled)[0][1]
print(f'Predicted Probability of being in the top 10: {predicted_probability}')

Please enter the values for the following features:
Careers service rank 2022: 56
International board (%) 2022: 80
International students (%) 2022: 30
Aims achieved (%) 2022: 65
Internships (%): 70
Average course length (months): 16
Predicted Probability of being in the top 10: 0.028189012810127506


