In [3]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

data = pd.read_csv('interview_questions.csv')
print(data.head())  # Check the structure of your data


                                            Question Label
0                         1. Tell me about yourself.    HR
1         2. What are your strengths and weaknesses?    HR
2                   3. Why do you want to work here?    HR
3        4. Where do you see yourself in five years?    HR
4  5. Can you describe a challenge you overcame a...    HR


In [4]:
data['Question'] = data['Question'].str.lower()


In [5]:
data['Question'] = data['Question'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))


In [8]:
import re

data['Question'] = data['Question'].apply(lambda x: re.sub(r'^\d+\s*', '', x))

print(data.head())

                                            Question Label
0                             tell me about yourself    HR
1             what are your strengths and weaknesses    HR
2                       why do you want to work here    HR
3            where do you see yourself in five years    HR
4  can you describe a challenge you overcame at work    HR


In [10]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['Question'])  # Features
y = data['Label']  # Labels


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score


In [12]:
# Assuming 'Cleaned_Question' contains the cleaned text and 'Label' the target labels
X = data['Question']
y = data['Label']

# Vectorize the questions using TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

# Split the data into train and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [13]:
# Initialize and train the Logistic Regression model
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# Make predictions on the test data
y_pred_log = log_reg.predict(X_test)

# Evaluate the Logistic Regression model
print("Logistic Regression Results:")
print(classification_report(y_test, y_pred_log))
print(f"Accuracy: {accuracy_score(y_test, y_pred_log) * 100:.2f}%")


Logistic Regression Results:
              precision    recall  f1-score   support

          HR       0.95      1.00      0.97       386
   Technical       1.00      0.89      0.94       199

    accuracy                           0.96       585
   macro avg       0.97      0.95      0.96       585
weighted avg       0.97      0.96      0.96       585

Accuracy: 96.41%


In [14]:
# Initialize and train the SVM model
svm_model = SVC(kernel='linear')  # Linear kernel for text classification
svm_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred_svm = svm_model.predict(X_test)

# Evaluate the SVM model
print("SVM Results:")
print(classification_report(y_test, y_pred_svm))
print(f"Accuracy: {accuracy_score(y_test, y_pred_svm) * 100:.2f}%")


SVM Results:
              precision    recall  f1-score   support

          HR       0.98      1.00      0.99       386
   Technical       1.00      0.97      0.98       199

    accuracy                           0.99       585
   macro avg       0.99      0.98      0.99       585
weighted avg       0.99      0.99      0.99       585

Accuracy: 98.97%


In [17]:
def classify_question(question):
    # Preprocess the question: remove punctuation, question numbers, and transform to lowercase
    cleaned_question = re.sub(r'^\d+\s*', '', question)  # Remove question numbers
    cleaned_question = cleaned_question.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    cleaned_question = cleaned_question.lower()  # Convert to lowercase
    
    # Vectorize the cleaned question
    question_vectorized = vectorizer.transform([cleaned_question])  # Use the same vectorizer
    
    # Predict using the trained model (you can use log_reg or svm_model)
    prediction = svm_model.predict(question_vectorized)  # Change to svm_model if needed
    
    return prediction[0]  # Return the predicted label


In [18]:
# Example questions to classify
test_questions = [
    "What is your greatest strength?",
    "Explain the concept of polymorphism.",
    "How do you handle conflict in the workplace?",
    "What are the different types of database normalization?"
]

# Classify each question
for question in test_questions:
    label = classify_question(question)
    print(f"Question: '{question}' is classified as: {label}")


Question: 'What is your greatest strength?' is classified as: HR
Question: 'Explain the concept of polymorphism.' is classified as: Technical
Question: 'How do you handle conflict in the workplace?' is classified as: HR
Question: 'What are the different types of database normalization?' is classified as: Technical


In [19]:
import pandas as pd
import re
import string
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import tkinter as tk
from tkinter import messagebox

In [21]:
# Save the model and vectorizer for later use
joblib.dump(log_reg, 'logistic_regression_model.pkl')
# Save the model and vectorizer for later use
joblib.dump(svm_model, 'smv_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

In [22]:
# Step 5: Create the GUI
def classify_question(question):
    cleaned_question = re.sub(r'^\d+\s*', '', question)  # Remove question numbers
    cleaned_question = cleaned_question.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    cleaned_question = cleaned_question.lower()  # Convert to lowercase
    question_vectorized = vectorizer.transform([cleaned_question])
    prediction = log_reg.predict(question_vectorized)
    return prediction[0]  # Return the predicted label

def on_classify():
    question = entry.get()
    if question:
        label = classify_question(question)
        messagebox.showinfo("Result", f"The question is classified as: {label}")
    else:
        messagebox.showwarning("Input Error", "Please enter a question.")


In [23]:
# Set up the main window
root = tk.Tk()
root.title("Question Classifier")

# Create a label and entry for the question
label = tk.Label(root, text="Enter your question:")
label.pack(pady=10)

entry = tk.Entry(root, width=50)
entry.pack(pady=10)

# Create a classify button
button = tk.Button(root, text="Classify Question", command=on_classify)
button.pack(pady=20)

# Start the GUI loop
root.mainloop()