In [86]:
import pandas as pd
import ast
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

In [87]:
# Load the dataset
data = pd.read_csv('./all_students_data.csv')
data.head()

Unnamed: 0,Registration Number,Name,CGPA,Projects,Internships,Courses,No_of_Certs,Certifications,LeetCode Score,Placement Status,Company Portfolio,10th Grade Percentage,12th Grade Percentage,CTC,IQ,Gender,Skills,Skill Score
0,20CSE0092,Ashley Esparza,8.66,10,['UI Design'],"['CSC Frontend', 'CSC Frontend', 'VLSI', 'Clou...",4,4,234,Not Placed,,77,82,6 LPA,82,Female,"['Java', 'Machine Learning', 'Web Development'...",4
1,20CSE0181,Jessica Jensen,6.35,3,"['UI Design', 'ML/AI', 'Business Management']","['CSC Frontend', 'System Design']",2,5,23,Not Placed,,97,63,1 CR LPA,104,Female,"['Web Development', 'Database Management', 'Co...",10
2,20CSE0008,Miss Elizabeth Alvarez,6.32,5,"['Cloud Engineer', 'ML/AI']","['Cloud Engineer', 'Chip Design']",2,2,487,Placed,DREAM,64,76,30 LPA,138,Male,"['Database Management', 'Web Development', 'Py...",6
3,20EEE0190,Amanda Alexander,7.32,4,['Chip Design'],"['Chip Design', 'ML Engineer']",2,2,63,Not Placed,,67,74,20 LPA,134,Female,['Machine Learning'],4
4,20CSE0003,Douglas Gonzalez,8.57,6,"['System Design', 'Hardware Engineer']","['Cloud Engineer', 'System Design', 'ML Engine...",3,9,36,Placed,DREAM,86,98,60 LPA,101,Female,"['Web Development', 'Machine Learning', 'Python']",4


In [88]:
# Sample input data
data['Internships'] = data['Internships'].apply(eval)  # Convert string representation to list of dictionaries

# Preprocess 'Internships' column to extract roles or positions and assign unique values
roles_dict = {}
next_value = 1

In [89]:
role_weights = {
    'Software Engineer': 1.2,
    'Hardware Engineer': 1.5,
    'UI Design': 0.8,
    'Robotics': 1.3,
    'Business Management': 0.9,
    'ML/AI': 2.0,
    'Cloud Engineer': 1.7,
    'System Design': 1.4
}

In [90]:
stream_weights = {
    'CSC Frontend': 1.2,
    'CSC Backend': 1.5,
    'VLSI': 2.3,
    'Chip Design': 1.5,
    'ML Engineer': 1.8,
    'Cloud Engineer': 1.7,
    'System Design': 2.4
}


In [91]:
skill_weights = {
    'Python': 1.5,
    'Java': 1.2,
    'C++': 1.0,
    'Data Analysis': 1.8,
    'Machine Learning': 2.0,
    'Web Development': 1.5,
    'Database Management': 1.2,
    'Communication': 1.2
}

In [92]:
def calculate_intern_weights(internships):
    if isinstance(internships, str):  # Check if internships is a string
        internships = [internships]  # Convert string to list with a single element
    total_weight = sum(role_weights.get(role, 0) for role in internships)
    return total_weight

# Calculate intern weights for each row
data['InternWeights'] = data['Internships'].apply(calculate_intern_weights)

In [93]:
def calculate_skill_score(skills):
    if isinstance(skills, str):  # Check if skills is a string
        skills = ast.literal_eval(skills)  # Convert string to list
    total_weight = sum(skill_weights.get(skill, 0) for skill in skills)
    return total_weight

# Calculate skill score for each row
data['Skill Score'] = data['Skills'].apply(calculate_skill_score)

In [94]:
def calculate_course_weights(courses):
    if isinstance(courses, str):  # Check if courses is a string
        courses = ast.literal_eval(courses)  # Convert string to list
    total_weight = sum(stream_weights.get(stream, 0) for stream in courses)
    return total_weight



# Calculate course weights for each row
data['CourseWeights'] = data['Courses'].apply(calculate_course_weights)

In [95]:
# Map 'Male' to 0 and 'Female' to 1
data['Gender'] = data['Gender'].map({'Male': 0, 'Female': 1})
#print(data.columns)



In [96]:
print(data['InternWeights'])

0      0.8
1      3.7
2      3.7
3      0.0
4      2.9
      ... 
994    4.0
995    1.3
996    4.1
997    0.0
998    2.5
Name: InternWeights, Length: 999, dtype: float64


In [97]:

# Display the DataFrame with the added CourseWeights column
print(data['CourseWeights'])

0      6.4
1      3.6
2      3.2
3      3.3
4      5.9
      ... 
994    7.1
995    4.7
996    3.8
997    2.7
998    5.8
Name: CourseWeights, Length: 999, dtype: float64


In [98]:
print(data['Skill Score'])

0      7.7
1      3.9
2      6.4
3      2.0
4      5.0
      ... 
994    5.0
995    6.1
996    1.5
997    6.9
998    3.7
Name: Skill Score, Length: 999, dtype: float64


In [99]:
# Map 'Placed' to 1 and 'Not Placed' to 0
data['Placement Status'] = data['Placement Status'].map({'Placed': 1, 'Not Placed': 0})

# Display the DataFrame with the updated 'Placement Status' column
print(data)


    Registration Number                    Name  CGPA  Projects  \
0             20CSE0092          Ashley Esparza  8.66        10   
1             20CSE0181          Jessica Jensen  6.35         3   
2             20CSE0008  Miss Elizabeth Alvarez  6.32         5   
3             20EEE0190        Amanda Alexander  7.32         4   
4             20CSE0003        Douglas Gonzalez  8.57         6   
..                  ...                     ...   ...       ...   
994           20EEE0305        Robert Mccormick  8.97         5   
995           20EEE0183           Stacy Shelton  8.61         2   
996           20ECE0275          Brittany Woods  9.76        10   
997           20CSE0163          Janice Walters  9.10         5   
998           20EEE0270             Laura Smith  7.01         8   

                                           Internships  \
0                                          [UI Design]   
1              [UI Design, ML/AI, Business Management]   
2                    

In [100]:
features = ['CGPA', 'LeetCode Score', 'Projects', 'Certifications', 'No_of_Certs', 
            '10th Grade Percentage', '12th Grade Percentage', 'InternWeights', 'CourseWeights', 'IQ', 'Gender', 'Skill Score']
target = 'Placement Status'
X = data[features]
y = data[target]

In [101]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [102]:
# Initialize classifiers
rf_classifier = RandomForestClassifier(random_state=42)
xgb_classifier = XGBClassifier(random_state=42)

# Fine-tuning RandomForestClassifier
rf_param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [103]:
rf_grid_search = GridSearchCV(estimator=rf_classifier, param_grid=rf_param_grid, cv=5)
rf_grid_search.fit(X_train, y_train)

# Fine-tuning XGBClassifier
xgb_param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.05, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

In [104]:

xgb_grid_search = GridSearchCV(estimator=xgb_classifier, param_grid=xgb_param_grid, cv=5)
xgb_grid_search.fit(X_train, y_train)

# Best parameters for RandomForestClassifier and XGBClassifier
print("Best parameters for RandomForestClassifier:", rf_grid_search.best_params_)
print("Best parameters for XGBClassifier:", xgb_grid_search.best_params_)

Best parameters for RandomForestClassifier: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 150}
Best parameters for XGBClassifier: {'colsample_bytree': 0.9, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.9}


In [105]:
dt_classifier = DecisionTreeClassifier()

In [106]:
 #Create ensemble model using VotingClassifier with fine-tuned parameters
ensemble_model = VotingClassifier(
    estimators=[
        ('rf', rf_classifier),
        ('xgb', xgb_classifier), 
    ],
    voting='soft'  # Soft voting for probabilities averaging
)


In [107]:
# Train ensemble model
ensemble_model.fit(X_train, y_train)

# Predict using ensemble model
ensemble_predictions = ensemble_model.predict(X_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, ensemble_predictions)
print(f"Ensemble Model Accuracy: {accuracy}")

Ensemble Model Accuracy: 0.64


In [118]:


# Sample test data with modified values
test_data = {
    'Registration Number': ['20MID0135'],
    'Name': ['Varun Babu'],
    'CGPA': [6.1],
    'Projects': [5],
    'Internships': [['UI Design','Cloud Engineer']],  
    'Courses': [['CSC Frontend', 'Cloud Engineer','ML Engineer']],  
    'No_of_Certs': [2],
    'Certifications': [4],
    'LeetCode Score': [234],
    '10th Grade Percentage': [97],
    '12th Grade Percentage': [92],
    'IQ': [95],  
    'Gender': [1],
    'Skills': [['Data Analysis']],
    'Skill Score': [5]
}

# Convert test data to DataFrame
test_df = pd.DataFrame(test_data)

# Calculate InternWeights, CourseWeights, and Skill Score for test data
test_df['InternWeights'] = test_df['Internships'].apply(calculate_intern_weights)
test_df['CourseWeights'] = test_df['Courses'].apply(calculate_course_weights)
test_df['SkillScore'] = test_df['Skills'].apply(calculate_skill_score)

# Display test DataFrame
#print(test_df)

# Define function to suggest skill improvements
def suggest_skill_improvements(skill_score):
    suggestions = []
    if skill_score <= 2:
        suggestions.append("Improve your skills: ML")
    if 'Python' not in test_data['Skills'][0]:
        suggestions.append("Learn Python")
    if 'Java' not in test_data['Skills'][0]:
        suggestions.append("Learn Java")
    return suggestions

# Define function to determine likelihood of placement
def likelihood_of_placement(skill_score):
    if skill_score <= 2:
        return "Regular"
    elif skill_score <= 4:
        return "Dream"
    else:
        return "Super Dream"

# Function to suggest companies based on likelihood of placement
def suggest_companies(placement_likelihood):
    if placement_likelihood == "Regular":
        return ["IBM", "CodeVault", "Volvo"]
    elif placement_likelihood == "Dream":
        return ["Altair", "ALE", "Intel", "Zomato"]
    else:
        return ["AMD", "Marvell", "JP Morgan", "Uber"]

# Extract features for prediction
features = ['CGPA', 'LeetCode Score', 'Projects', 'Certifications', 'No_of_Certs', 
            '10th Grade Percentage', '12th Grade Percentage', 'InternWeights', 
            'CourseWeights', 'IQ', 'Gender', 'Skill Score']
test_features = test_df[features]

# Predict placement status for test data
test_predictions = ensemble_model.predict(test_features)

if test_predictions == 0:
    print("Better luck next time!")
else:
    print("Predicted Placement Status for Test Data:", test_predictions)
    # Calculate skill score for test data
    test_skill_score = test_df['SkillScore'].iloc[0]

    # Suggest skill improvements
    print("Skill Improvement Suggestions:", suggest_skill_improvements(test_skill_score))

    # Determine likelihood of placement
    placement_likelihood = likelihood_of_placement(test_skill_score)
    print("Likelihood of Placement:", placement_likelihood)

    # Suggest companies based on likelihood of placement
    print("Suggested Companies:", suggest_companies(placement_likelihood))


Predicted Placement Status for Test Data: [1]
Skill Improvement Suggestions: ['Improve your skills: ML', 'Learn Python', 'Learn Java']
Likelihood of Placement: Regular
Suggested Companies: ['IBM', 'CodeVault', 'Volvo']


In [110]:
import joblib

joblib.dump(ensemble_model, 'model.pkl')

['model.pkl']