In [None]:
pip install xgboost


In [14]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import xgboost as xgb



In [15]:
# Read the dataset from a CSV file; the relative path is resolved against the
# kernel's current working directory.
csv_path = "data.csv"
data = pd.read_csv(csv_path)



In [16]:
# Preprocess the data
# Integer codes for the two categorical rating columns.
rating_codes = {
    "experience_rating": {"Positive Experience": 1, "Neutral Experience": 0},
    "interview_rating": {"Easy Interview": 1, "Average Interview": 2, "Difficult Interview": 3},
}
for rating_column, codes in rating_codes.items():
    # Series.map turns every value missing from the dict into NaN, so mapping an
    # already-encoded column a second time would wipe it out. Encode only while
    # raw labels are still present, which keeps this cell safe to re-run.
    if data[rating_column].isin(list(codes)).any():
        data[rating_column] = data[rating_column].map(codes)
# NOTE(review): labels that are not keys of the dicts above become NaN after
# mapping — confirm the data has no other categories, or that NaN is intended.

# The model input is the text column that gets vectorized later; the target is
# the offer_acceptance column.
X = data["interview_process"]
y = data["offer_acceptance"]



In [17]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, repeatable.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts



In [18]:
# Two text featurizers configured identically: English stop words are removed
# and both unigrams and bigrams are extracted. One yields raw counts, the other
# TF-IDF weights.
vectorizer_options = {"stop_words": "english", "ngram_range": (1, 2)}
count_vect = CountVectorizer(**vectorizer_options)
tfidf_vect = TfidfVectorizer(**vectorizer_options)



In [19]:
# Learn each vectorizer's vocabulary from the training texts only and build the
# corresponding training feature matrices (count-based first, then TF-IDF).
X_train_count, X_train_tfidf = (
    vectorizer.fit_transform(X_train)
    for vectorizer in (count_vect, tfidf_vect)
)



In [20]:
# Candidate classifiers to train and evaluate.
# Each entry holds a display name, an unfitted estimator, and the
# hyper-parameter grid that GridSearchCV searches for that estimator.
#
# The tree-based and boosted models draw random numbers while fitting, so they
# are seeded to make scores repeatable between runs. The seed is the same value
# used for the train/test split.
seed = 42

classifiers = [
    {
        "name": "Logistic Regression",
        # max_iter is raised to 1000 iterations for the solver
        "model": LogisticRegression(max_iter=1000),
        "params": {"C": [0.1, 1, 10]}
    },
    {
        "name": "Naive Bayes",
        "model": MultinomialNB(),
        "params": {"alpha": [0.1, 0.5, 1]}
    },
    {
        "name": "Decision Tree",
        "model": DecisionTreeClassifier(random_state=seed),
        "params": {"max_depth": [5, 10, 20]}
    },
    {
        "name": "Random Forest",
        "model": RandomForestClassifier(n_estimators=100, random_state=seed),
        "params": {"max_depth": [5, 10, 20]}
    },
    {
        "name": "Gradient Boosting",
        "model": GradientBoostingClassifier(random_state=seed),
        "params": {"n_estimators": [50, 100, 200]}
    },
    {
        "name": "XGBoost",
        # NOTE(review): recent XGBoost releases expect integer class labels
        # 0..k-1 — confirm the dtype of y before relying on this entry.
        "model": xgb.XGBClassifier(random_state=seed),
        "params": {"max_depth": [5, 10, 20], "learning_rate": [0.01, 0.1, 1]}
    }
]



In [None]:
def _grid_search_scores(model, params, train_features, train_labels, test_features, test_labels):
    """Tune `model` with a 5-fold grid search and score it on held-out data.

    Parameters:
        model: unfitted estimator to tune.
        params: hyper-parameter grid passed to GridSearchCV.
        train_features, train_labels: data the grid search is fitted on.
        test_features, test_labels: held-out data used for the reported scores.

    Returns:
        (search, predictions, accuracy, weighted_f1), where `search` is the
        fitted GridSearchCV and `predictions` are its test-set predictions.
    """
    search = GridSearchCV(model, params, cv=5, n_jobs=-1)
    search.fit(train_features, train_labels)
    predictions = search.predict(test_features)
    accuracy = accuracy_score(test_labels, predictions)
    weighted_f1 = f1_score(test_labels, predictions, average='weighted')
    return search, predictions, accuracy, weighted_f1


# Vectorize the held-out texts once. These matrices do not depend on the
# classifier, so recomputing them inside the loop only repeated identical work.
X_test_count = count_vect.transform(X_test)
X_test_tfidf = tfidf_vect.transform(X_test)

for clf in classifiers:
    print(f"Training {clf['name']}...")

    # Bag-of-words count features
    grid_search, y_pred_count, accuracy_count, f1_count = _grid_search_scores(
        clf["model"], clf["params"], X_train_count, y_train, X_test_count, y_test
    )
    print(f"Count Vectorizer Accuracy: {accuracy_count}")
    print(f"Count Vectorizer F1 Score: {f1_count}")

    # TF-IDF weighted features
    grid_search, y_pred_tfidf, accuracy_tfidf, f1_tfidf = _grid_search_scores(
        clf["model"], clf["params"], X_train_tfidf, y_train, X_test_tfidf, y_test
    )
    print(f"Tfidf Vectorizer Accuracy: {accuracy_tfidf}")
    print(f"Tfidf Vectorizer F1 Score: {f1_tfidf}")

Training Logistic Regression...
Count Vectorizer Accuracy: 0.6457973019716361
Count Vectorizer F1 Score: 0.6129211470010166
Tfidf Vectorizer Accuracy: 0.6444136976824628
Tfidf Vectorizer F1 Score: 0.6162937730249137
Training Naive Bayes...
Count Vectorizer Accuracy: 0.6312694569353166
Count Vectorizer F1 Score: 0.5816468763829895
Tfidf Vectorizer Accuracy: 0.6295399515738499
Tfidf Vectorizer F1 Score: 0.575927053164404
Training Decision Tree...
Count Vectorizer Accuracy: 0.5921826357661709
Count Vectorizer F1 Score: 0.570468713328316
Tfidf Vectorizer Accuracy: 0.5800760982359046
Tfidf Vectorizer F1 Score: 0.5559295572215193
Training Random Forest...
Count Vectorizer Accuracy: 0.5887236250432376
Count Vectorizer F1 Score: 0.43742032384261376
Tfidf Vectorizer Accuracy: 0.588031822898651
Tfidf Vectorizer F1 Score: 0.4359468523821798
Training Gradient Boosting...
