In [54]:
import pandas as pd

df1 = pd.read_csv('School Data - Baramati.csv')  
df2 = pd.read_csv('School Data - Phaltan.csv') 

df_combined = pd.concat([df1, df2], ignore_index=True)

print(df_combined.head())


   S No                      School Name  Establishment Region  \
0   1.0                     Anad Vid Hol         1978.0  Rural   
1   2.0      Anand Eng Med Public school         2013.0  Rural   
2   3.0                       Anandnagar         2005.0  Rural   
3   4.0  New English School Pandharwasti         1989.0  Rural   
4   5.0                   Zpps Aathphata         1948.0  Rural   

  Pre-primary section (Y/N)  Classes medium of instructions   Management  \
0                        No  5 to 12                Marathi    Pvt.Aided   
1                       Yes   1 to 3                English  Pvt.Unaided   
2                        No   1 to 4                Marathi   Local Body   
3                        No  5 to 10                Marathi    Pvt.Aided   
4                        No   1 to 4                Marathi   Local Body   

    Cluster  Sr.No  
0  Athphata    NaN  
1  Athphata    NaN  
2  Athphata    NaN  
3  Athphata    NaN  
4  Athphata    NaN  


In [55]:
# Clean column names
df_combined.columns = df_combined.columns.str.strip().str.lower().str.replace(" ", "_")

# Optional: Clean values (like Pre-primary section and Classes)
df_combined['pre-primary_section_(y/n)'] = df_combined['pre-primary_section_(y/n)'].str.strip().str.upper()
df_combined['classes'] = df_combined['classes'].str.strip().str.upper()
df_combined['medium_of_instructions'] = df_combined['medium_of_instructions'].str.strip().str.capitalize()
df_combined['management'] = df_combined['management'].str.lower().str.replace('.', '').str.replace(' ', '')


In [56]:
from sklearn.preprocessing import LabelEncoder

# Drop unneeded columns like Sr. No or School Name (they are identifiers)
df_model = df_combined.drop(['s_no', 'school_name'], axis=1)

# Encode all string columns
for col in df_model.select_dtypes(include='object').columns:
    df_model[col] = LabelEncoder().fit_transform(df_model[col])

print(df_model.head())


   establishment  region  pre-primary_section_(y/n)  classes  \
0         1978.0       0                          1       17   
1         2013.0       0                          3        4   
2         2005.0       0                          1        5   
3         1989.0       0                          1       15   
4         1948.0       0                          1        5   

   medium_of_instructions  management  cluster  sr.no  
0                       3           2        2    NaN  
1                       0           3        2    NaN  
2                       3           0        2    NaN  
3                       3           2        2    NaN  
4                       3           0        2    NaN  


In [57]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Define X and y
X = df_model.drop('cluster', axis=1)
y = df_model['cluster']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Train the model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.46060606060606063
              precision    recall  f1-score   support

           0       1.00      0.67      0.80         3
           1       0.50      0.67      0.57         3
           2       0.00      0.00      0.00         3
           3       0.78      1.00      0.88         7
           4       0.50      0.33      0.40         3
           5       0.00      0.00      0.00         3
           6       0.00      0.00      0.00         4
           7       0.50      0.67      0.57         3
           8       0.43      0.50      0.46         6
           9       0.00      0.00      0.00         3
          10       0.75      1.00      0.86         3
          11       0.00      0.00      0.00         2
          12       0.00      0.00      0.00         3
          13       1.00      0.50      0.67         2
          14       0.00      0.00      0.00         2
          15       0.57      1.00      0.73         4
          16       0.00      0.00      0.00        

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [58]:
import joblib
joblib.dump(model, 'school_cluster_predictor.pkl')


['school_cluster_predictor.pkl']