In [None]:
import pandas as pd
import numpy as np
import random

def generate_student_dataset(num_students=1000):
    data = {
        'English': [],
        'Regional Language': [],
        'Mathematics': [],
        'Physics': [],
        'Chemistry': [],
        'Biology': [],
        'History': [],
        'Geography': [],
        'Civics': [],
        'Economics': [],
        'Computer Applications': [],
        'Physical Education': [],
        'Stream': []
    }

    for _ in range(num_students):
        # Generate random marks for each subject (out of 100)
        english = random.randint(30, 98)
        regional_language = random.randint(30, 98)
        mathematics = random.randint(20, 100)
        physics = random.randint(20, 100)
        chemistry = random.randint(20, 100)
        biology = random.randint(20, 100)
        history = random.randint(30, 95)
        geography = random.randint(30, 95)
        civics = random.randint(30, 95)
        economics = random.randint(30, 95)
        computer_applications = random.randint(30, 98)
        physical_education = random.randint(30, 98)

        # Determine stream based on marks (with some randomness)
        science_score = (mathematics + physics + chemistry + biology) / 4
        commerce_score = (mathematics + economics + civics + geography) / 4
        arts_score = (history + geography + civics + economics + regional_language + english)/6

        # Adding some randomness and weighting to the choice.
        science_preference = science_score + random.uniform(-10, 10)
        commerce_preference = commerce_score + random.uniform(-10, 10)
        arts_preference = arts_score + random.uniform(-10, 10)

        if science_preference > commerce_preference and science_preference > arts_preference:
            stream = 'Science'
        elif commerce_preference > science_preference and commerce_preference > arts_preference:
            stream = 'Commerce'
        else:
            stream = 'Arts'

        # Add data to the dictionary
        data['English'].append(english)
        data['Regional Language'].append(regional_language)
        data['Mathematics'].append(mathematics)
        data['Physics'].append(physics)
        data['Chemistry'].append(chemistry)
        data['Biology'].append(biology)
        data['History'].append(history)
        data['Geography'].append(geography)
        data['Civics'].append(civics)
        data['Economics'].append(economics)
        data['Computer Applications'].append(computer_applications)
        data['Physical Education'].append(physical_education)
        data['Stream'].append(stream)

    df = pd.DataFrame(data)
    return df

student_data = generate_student_dataset(1000)

print(student_data.head())

print("\nStream Distribution:")
print(student_data['Stream'].value_counts())

print("\nCSV Format Dataset:\n")
print(student_data.to_csv(index=False))

In [4]:
df = pd.read_csv('/content/student_data.csv')

In [5]:
df.head()

Unnamed: 0,English,Regional Language,Mathematics,Physics,Chemistry,Biology,History,Geography,Civics,Economics,Computer Applications,Physical Education,Stream
0,84,78,53,84,97,41,37,67,95,53,70,47,Science
1,55,46,84,22,86,96,58,44,41,58,81,85,Science
2,55,94,24,44,57,32,72,72,51,80,62,63,Arts
3,35,54,36,46,21,70,75,59,74,51,88,47,Arts
4,79,97,20,22,38,37,57,84,94,83,32,83,Arts


In [6]:
df.isnull().sum()

Unnamed: 0,0
English,0
Regional Language,0
Mathematics,0
Physics,0
Chemistry,0
Biology,0
History,0
Geography,0
Civics,0
Economics,0


In [8]:
#Data preprocessing
#Label encoding
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
encoder.fit(df['Stream'])

In [11]:
df['Stream'] = encoder.transform(df['Stream'])

In [12]:
df.head()

Unnamed: 0,English,Regional Language,Mathematics,Physics,Chemistry,Biology,History,Geography,Civics,Economics,Computer Applications,Physical Education,Stream
0,84,78,53,84,97,41,37,67,95,53,70,47,2
1,55,46,84,22,86,96,58,44,41,58,81,85,2
2,55,94,24,44,57,32,72,72,51,80,62,63,0
3,35,54,36,46,21,70,75,59,74,51,88,47,0
4,79,97,20,22,38,37,57,84,94,83,32,83,0


In [55]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = df.iloc[:,:-1]
y = df['Stream']
# Always split first before scaling
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# THEN scale
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [56]:
#Using grid search with CV
#Importing major models for classification
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB

In [57]:
#Defining a function to get the score of each model to compare
def get_score(model, X_train, X_test,y_train,y_test):
    model.fit(X_train,y_train)
    return model.score(X_test,y_test)

In [26]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=3)
scores_lr = []
scores_svm = []
scores_rf = []
scores_knn = []
scores_GB = []
scores_XGB = []
scores_GNB = []

lr = LogisticRegression()
svm = SVC()
rf = RandomForestClassifier()
knn = KNeighborsClassifier()
GB = GradientBoostingClassifier()
XGB = XGBClassifier()
GNB = GaussianNB()

for train_index, test_index in kf.split(X_scaled):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y[train_index], y[test_index]

    scores_lr.append(get_score(lr, X_train, X_test, y_train, y_test))
    scores_svm.append(get_score(svm, X_train, X_test, y_train, y_test))
    scores_rf.append(get_score(rf, X_train, X_test, y_train, y_test))
    scores_knn.append(get_score(knn, X_train, X_test, y_train, y_test))
    scores_GB.append(get_score(GB, X_train, X_test, y_train, y_test))
    scores_XGB.append(get_score(XGB, X_train, X_test, y_train, y_test))
    scores_GNB.append(get_score(GNB, X_train, X_test, y_train, y_test))

In [27]:
print(scores_lr)

[0.6946107784431138, 0.7177177177177178, 0.7267267267267268]


In [28]:
print("Logistic Regression:", sum(scores_lr)/len(scores_lr))
print("SVM:", sum(scores_svm)/len(scores_svm))
print("Random Forest:", sum(scores_rf)/len(scores_rf))
print("KNN:", sum(scores_knn)/len(scores_knn))
print("Gradient Boosting:", sum(scores_GB)/len(scores_GB))
print("XGBoost:", sum(scores_XGB)/len(scores_XGB))
print("Naive Bayes:", sum(scores_GNB)/len(scores_GNB))

Logistic Regression: 0.713018407629186
SVM: 0.6960013906121691
Random Forest: 0.6649823476170782
KNN: 0.5819771867676059
Gradient Boosting: 0.6609603615591639
XGBoost: 0.6569773366180551
Naive Bayes: 0.7059964155772539


In [58]:
#Parameter grid for grid search
param_grid_dict = {
    'LogisticRegression': {
        'C': [0.01, 0.1, 1, 10],
        'solver': ['liblinear', 'lbfgs'],
        'max_iter': [100, 200]
    },
    'SVC': {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf', 'poly'],
        'gamma': ['scale', 'auto']
    },
    'RandomForestClassifier': {
        'n_estimators': [50, 100, 150],
        'max_depth': [None, 5, 10],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    },
    'KNeighborsClassifier': {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    },
    'GradientBoostingClassifier': {
        'n_estimators': [50, 100, 150],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    },
    'XGBClassifier': {
        'n_estimators': [50, 100, 150],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0]
    },
    'GaussianNB': {
        # No major hyperparameters, but you can tune priors or var_smoothing
        'var_smoothing': [1e-09, 1e-08, 1e-07]
    }
}


In [59]:
selected_models = {
    "LogisticRegression" : LogisticRegression(),
    "SVC" : SVC(),
    "RandomForestClassifier" : RandomForestClassifier(random_state=42),
    "KNeighborsClassifier" : KNeighborsClassifier(),
    "GradientBoostingClassifier" : GradientBoostingClassifier(),
    "XGBClassifier" : XGBClassifier(),
    "GaussianNB" : GaussianNB()
}

In [60]:
from sklearn.model_selection import RandomizedSearchCV
best_models = {}
for name, model in selected_models.items():
    print(f"Tuning {name}....")
    search = RandomizedSearchCV(model, param_grid_dict[name], n_iter=5,cv=5,scoring="accuracy",n_jobs=-1,verbose=1,random_state=42)
    search.fit(X_train_scaled,y_train)
    best_models[name] = search.best_estimator_
    print(f"Best params for {name}: {search.best_params_}\n")

Tuning LogisticRegression....
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best params for LogisticRegression: {'solver': 'lbfgs', 'max_iter': 100, 'C': 10}

Tuning SVC....
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best params for SVC: {'kernel': 'linear', 'gamma': 'scale', 'C': 0.1}

Tuning RandomForestClassifier....
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best params for RandomForestClassifier: {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 5}

Tuning KNeighborsClassifier....
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best params for KNeighborsClassifier: {'weights': 'distance', 'n_neighbors': 7, 'metric': 'manhattan'}

Tuning GradientBoostingClassifier....
Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best params for GradientBoostingClassifier: {'n_estimators': 50, 'max_depth': 3, 'learning_rate': 0.1}

Tuning XGBClassifier....
Fitting 5 folds for each of 5 candidat



In [62]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

tuning_results = {}
for name, model in best_models.items():
    y_pred = model.predict(X_test_scaled)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    tuning_results[name] = [accuracy,precision,recall,f1]

tuning_results_df = pd.DataFrame(tuning_results, index=["accuracy", "precision", "recall", "f1"]).T
print("\nFinal model performance after tuning:\n", tuning_results_df)


Final model performance after tuning:
                             accuracy  precision  recall        f1
LogisticRegression             0.760   0.763886   0.760  0.758001
SVC                            0.755   0.758070   0.755  0.752913
RandomForestClassifier         0.660   0.674304   0.660  0.651007
KNeighborsClassifier           0.640   0.642899   0.640  0.637984
GradientBoostingClassifier     0.700   0.707983   0.700  0.697709
XGBClassifier                  0.715   0.724302   0.715  0.711708
GaussianNB                     0.740   0.743795   0.740  0.738220


In [None]:
'''Logistic regression is the best model to save and use for prediction
with Best params for LogisticRegression: {'solver': 'lbfgs', 'max_iter': 100, 'C': 10}'''

In [63]:
from sklearn.linear_model import LogisticRegression

lr_saved = LogisticRegression(solver= 'lbfgs', max_iter= 100, C= 10)
lr_saved.fit(X_train_scaled,y_train)

In [64]:
y_pred = lr_saved.predict(X_test_scaled)

In [65]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [66]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

In [67]:
print(accuracy)
print(precision)
print(recall)
print(f1)

0.76
0.7638855709507882
0.76
0.758000508517671


In [69]:
import joblib
joblib.dump(lr_saved,'stream_predict.pkl')

['stream_predict.pkl']

In [70]:
m = joblib.load("stream_predict.pkl")

In [73]:
#Prdicting the simple input
sample_input = np.array([[75, 80, 85, 90, 70, 65, 88, 92, 78, 81, 74, 86]])

In [75]:
# Scaling the input
sample_input_scaled = scaler.transform(sample_input)

# Predicting using the saved Logistic Regression model
prediction = m.predict(sample_input_scaled)
#print("Predicted Class:", prediction[0])

if prediction[0] == 0:
  print("Student may select Arts")
elif prediction[0] == 1:
  print("Student may select commerce")
else:
  print("Student may select Science")

Student may select commerce




In [77]:
import pickle

with open("stream_predict.pkl", "wb") as f:
    pickle.dump(m, f)

with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

from google.colab import files
files.download("stream_predict.pkl")
files.download("scaler.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>