<a href="https://colab.research.google.com/github/abdullahkiani007/NeuralNet/blob/main/predictiveModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Download necessary NLTK data.
# 'punkt' is the tokenizer model used by older NLTK releases; newer releases
# load 'punkt_tab' instead, so both are requested to keep word_tokenize
# working regardless of the installed NLTK version.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

# Sample responses from a candidate.
# Five free-text answers; each one is tokenized by preprocess() and later
# turned into one feature row, so the order here is the row order downstream.
responses = [
    "I'm a software engineer with a passion for developing innovative programs that expedite the efficiency and effectiveness of organizational success.",
    "I demonstrated leadership when I led a team project to develop a new company website. I assigned tasks, set deadlines, and ensured that everyone was on track.",
    "During a team project, we faced a major technical issue. I organized a brainstorming session, and we collectively came up with a solution.",
    "One of my weaknesses is that I tend to be a perfectionist. I'm working on this by setting realistic goals and timeframes for my tasks.",
    "You should hire me because I have the skills and experience to make a significant contribution to your company. I'm a fast learner and can quickly adapt to new environments."
]

# Preprocess the responses
stop_words = set(stopwords.words('english'))


def preprocess(text):
    """Tokenize ``text`` and keep only the informative word tokens.

    The text is lower-cased and split with ``word_tokenize``. A token is kept
    only if ``str.isalnum()`` is true for it (which drops punctuation and any
    token containing an apostrophe or hyphen) and it is not in ``stop_words``.

    Args:
        text: Raw response string.

    Returns:
        List of surviving tokens, in their original order.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        if not token.isalnum():
            continue
        if token in stop_words:
            continue
        kept.append(token)
    return kept


# One token list per response, in the same order as `responses`
preprocessed_responses = list(map(preprocess, responses))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
from nltk.probability import FreqDist
from nltk import pos_tag
from textblob import TextBlob

# Download the necessary resources.
# Older NLTK releases load the tagger from 'averaged_perceptron_tagger';
# newer ones look for 'averaged_perceptron_tagger_eng'. Request both so
# pos_tag works regardless of the installed NLTK version.
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

# Extract features
def extract_features(text, raw_text=None):
    """Build a dict of simple lexical and sentiment features for one response.

    Args:
        text: List of preprocessed tokens (as returned by ``preprocess``).
        raw_text: Optional original, unprocessed response string. When given,
            it is used for the sentence count and for sentiment analysis.
            The token list has had punctuation removed, so counting sentences
            on it always yields 1, and stop-word removal drops negations such
            as "not", which distorts polarity. When omitted, both are computed
            from the space-joined tokens (the previous behaviour).

    Returns:
        Dict with keys ``word_count``, ``sentence_count``, ``noun_count``,
        ``verb_count``, ``adjective_count``, ``adverb_count``, ``polarity``
        and ``subjectivity``.
    """
    features = {}

    # Text used for sentence splitting and sentiment: prefer the raw response
    source_text = raw_text if raw_text is not None else ' '.join(text)

    # Word count (number of tokens that survived preprocessing)
    features['word_count'] = len(text)

    # Sentence count
    features['sentence_count'] = len(nltk.sent_tokenize(source_text))

    # Parts of speech (Penn Treebank tags), counted over the preprocessed tokens
    pos_counts = FreqDist(tag for (word, tag) in pos_tag(text))
    features['noun_count'] = pos_counts['NN'] + pos_counts['NNS']
    features['verb_count'] = pos_counts['VB'] + pos_counts['VBD'] + pos_counts['VBG'] + pos_counts['VBN'] + pos_counts['VBP'] + pos_counts['VBZ']
    features['adjective_count'] = pos_counts['JJ'] + pos_counts['JJR'] + pos_counts['JJS']
    features['adverb_count'] = pos_counts['RB'] + pos_counts['RBR'] + pos_counts['RBS']

    # Sentiment analysis
    blob = TextBlob(source_text)
    features['polarity'] = blob.sentiment.polarity
    features['subjectivity'] = blob.sentiment.subjectivity

    return features

# Apply feature extraction to all responses, pairing each token list with the
# raw response it came from (both lists share the same order).
features = [
    extract_features(tokens, raw_text=raw)
    for tokens, raw in zip(preprocessed_responses, responses)
]
print(features)


[{'word_count': 11, 'sentence_count': 1, 'noun_count': 6, 'verb_count': 2, 'adjective_count': 3, 'adverb_count': 0, 'polarity': 0.4, 'subjectivity': 0.5}, {'word_count': 16, 'sentence_count': 1, 'noun_count': 9, 'verb_count': 5, 'adjective_count': 2, 'adverb_count': 0, 'polarity': 0.13636363636363635, 'subjectivity': 0.45454545454545453}, {'word_count': 12, 'sentence_count': 1, 'noun_count': 5, 'verb_count': 4, 'adjective_count': 2, 'adverb_count': 1, 'polarity': 0.03125, 'subjectivity': 0.3}, {'word_count': 10, 'sentence_count': 1, 'noun_count': 4, 'verb_count': 4, 'adjective_count': 1, 'adverb_count': 0, 'polarity': 0.16666666666666666, 'subjectivity': 0.3333333333333333}, {'word_count': 13, 'sentence_count': 1, 'noun_count': 5, 'verb_count': 3, 'adjective_count': 3, 'adverb_count': 2, 'polarity': 0.26117424242424236, 'subjectivity': 0.6073863636363637}]


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Sample ratings (to be replaced with actual data).
# Placeholder values only: every list has five entries, one per sample
# response above, so the rows line up with `features` when concatenated.
ratings = {
    'Overall': [5, 4, 4, 3, 5],
    'RecommendHiring': [1, 1, 1, 0, 1],
    'Colleague': [4, 4, 4, 3, 5],
    'Engaged': [5, 4, 4, 3, 5],
    'Excited': [5, 4, 4, 3, 5],
    'EyeContact': [5, 4, 4, 3, 5],
    'Smiled': [5, 4, 4, 3, 5],
    'SpeakingRate': [5, 4, 4, 3, 5],
    'NoFillers': [5, 4, 4, 3, 5],
    'Friendly': [5, 4, 4, 3, 5],
    'Paused': [1, 2, 2, 3, 1],
    'EngagingTone': [5, 4, 4, 3, 5],
    'StructuredAnswers': [5, 4, 4, 3, 5],
    'Calm': [5, 4, 4, 3, 5],
    'NotStressed': [5, 4, 4, 3, 5],
    'Focused': [5, 4, 4, 3, 5],
    'Authentic': [5, 4, 4, 3, 5],
    'NotAwkward': [5, 4, 4, 3, 5],
    # Regression target of this toy example
    'Total': [90, 76, 76, 57, 90]
}

# Tabulate the text features and the ratings, then place them side by side
features_df, ratings_df = pd.DataFrame(features), pd.DataFrame(ratings)
data = pd.concat((features_df, ratings_df), axis='columns')

# 'Total' is the regression target; every other column is used as a predictor
y = data['Total']
X = data.drop(columns=['Total'])

# Hold out 20% of the rows, with a fixed seed for a repeatable split
split_parts = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

# Fit an ordinary least-squares model and score the held-out rows
model = LinearRegression().fit(X_train, y_train)
predictions = model.predict(X_test)

# Output predictions
print(predictions)


[89.07885259]


In [None]:
# Load the per-worker interview scores and keep only the rows whose 'Worker'
# value is "AGGR"; the 'Worker' column is then redundant and is dropped, and
# the index is renumbered from zero.
turker_score = pd.read_csv('turker_scores_full_interview.csv')
new_score = (
    turker_score.loc[turker_score["Worker"] == "AGGR"]
    .drop(columns=['Worker'])
    .reset_index(drop=True)
)

print(new_score)

# index=False keeps the positional index out of the file; otherwise it is
# written as an unnamed first column that comes back as 'Unnamed: 0' on reload.
new_score.to_csv('new_turker_score.csv', index=False)

    Participant   Overall  RecommendHiring  Colleague   Engaged   Excited  \
0            p1  5.297316         5.106224   5.333004  5.541380  5.043890   
1            p3  4.414892         4.433070   5.010430  5.616076  5.601586   
2            p4  4.494494         4.530129   4.500707  5.494494  4.261343   
3            p5  5.457670         5.571558   5.772488  5.903057  4.707062   
4            p6  5.106512         4.831482   4.860595  5.020385  4.648259   
..          ...       ...              ...        ...       ...       ...   
133        pp83  6.045748         5.806617   6.347873  6.243224  5.806617   
134        pp84  5.710073         6.020304   5.437203  5.749959  5.307262   
135        pp85  5.626074         5.766592   5.791370  6.375623  5.584830   
136        pp86  4.853881         4.700179   4.495230  5.513933  5.137644   
137        pp89  4.960084         4.370067   5.143083  5.138736  4.407999   

     EyeContact    Smiled  SpeakingRate  NoFillers  Friendly    Paused  \
0

In [None]:
# Load the prosodic features dataset
prosodic_features_df = pd.read_csv('prosodic_features.csv')

# Extract the participant ID prefix from the 'participant&question' column.
# The pattern accepts one or two leading 'P's followed by digits. The ratings
# file uses ids such as 'p1' and 'pp83'; a pattern that only allows a single
# 'P' cannot match a 'PP...' id, which would leave NaN here and make groupby
# silently drop those rows.
# NOTE(review): presumably this file contains 'PP...' rows too - confirm.
prosodic_features_df['participant'] = (
    prosodic_features_df['participant&question'].str.extract(r'^(PP?\d+)', expand=False)
)

# Drop the 'participant&question' column as it's no longer needed
prosodic_features_df = prosodic_features_df.drop(columns=['participant&question'])

# These two columns are coerced to numbers; unparseable entries become NaN
# and are then replaced by the column mean.
for column in ('avgVal3', 'avgBand3'):
    numeric_values = pd.to_numeric(prosodic_features_df[column], errors='coerce')
    prosodic_features_df[column] = numeric_values.fillna(numeric_values.mean())

# Group by 'participant' and calculate the mean for each group.
# numeric_only=True keeps the aggregation from failing should any
# non-numeric column remain.
prosodic_features_avg_df = (
    prosodic_features_df.groupby('participant').mean(numeric_only=True).reset_index()
)

# Save the processed data to a new CSV file (read back by the merge step)
prosodic_features_avg_df.to_csv('processed_prosodic_features.csv', index=False)

# Print the processed DataFrame to verify
print(prosodic_features_avg_df.head())

  participant   duration    energy     power  min_pitch   max_pitch  \
0          P1  34.186796  0.011510  0.000333  78.064294  340.516454   
1         P10  77.378613  0.032118  0.000404  74.787772  345.935347   
2         P11  50.431333  0.007220  0.000134  73.605587  384.004524   
3         P12  35.910442  0.008005  0.000205  73.706414  326.346399   
4         P13  51.049858  0.001618  0.000029  75.141598  332.370097   

   mean_pitch   pitch_sd   pitch_abs  pitch_quant  ...  avgDurPause  \
0  129.307639  25.242104  176.176383   121.943026  ...       0.5312   
1  112.955696  25.915806  166.265804   106.681483  ...       0.7232   
2  186.524402  35.911648  226.353579   183.792011  ...       0.5326   
3  181.006693  40.494458  266.110211   184.081501  ...       0.9796   
4  181.156105  41.081781  249.154938   189.416587  ...       0.6952   

   TotDurPause:3  iInterval  MaxRising:3  MaxFalling:3  AvgTotRis:3  \
0        17.9984       87.8     211.4666      186.2474      20.4186   
1   

In [None]:
import pandas as pd

def _load_keyed(path, key_column):
    """Read a CSV, drop a stray 'Unnamed: 0' column if present, and rename
    ``key_column`` (when it exists) to 'participant' so it can be the merge key."""
    frame = pd.read_csv(path)
    frame = frame.drop(columns=['Unnamed: 0'], errors='ignore')
    return frame.rename(columns={key_column: 'participant'})


# Load the three datasets with a common 'participant' key column
ratings_df = _load_keyed('new_turker_score.csv', 'Participant')
lexical_features_df = _load_keyed('interviewee_transcript_analysis_results.csv', 'Participant')
prosodic_features_df = _load_keyed('processed_prosodic_features.csv', 'participant&question')

# Lower-case the prosodic ids before joining on them
prosodic_features_df['participant'] = prosodic_features_df['participant'].str.lower()

# Inner-join ratings -> lexical -> prosodic on 'participant', then discard 'loudness'
combined_df = (
    ratings_df
    .merge(lexical_features_df, on='participant', how='inner')
    .merge(prosodic_features_df, on='participant', how='inner')
    .drop(columns=["loudness"])
)

# Report the size of each input and of the merged result
for label, frame in (
    ("prosodic shape", prosodic_features_df),
    ("lexical shape", lexical_features_df),
    ("ratings shape", ratings_df),
):
    print(label, frame.shape)

print("Combined DataFrame:")
print(combined_df.shape)
# combined_df.to_csv('combined_data.csv', index=False)


prosodic shape (69, 60)
lexical shape (138, 22)
ratings shape (138, 20)
Combined DataFrame:
(69, 99)


In [None]:
# Column the models are trained to predict
target = 'RecommendHiring'

# Target vector
y = combined_df[target]

# Feature matrix: everything except the participant id and the target itself.
# NOTE(review): the other rating columns (e.g. 'Overall', 'Total') remain in X -
# confirm this is intended, since they may leak information about the target.
X = combined_df.drop(columns=['participant', target])


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
split_parts = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts


In [None]:
# List the feature columns the models are trained on
print(X_train.columns)

Index(['Overall', 'Colleague', 'Engaged', 'Excited', 'EyeContact', 'Smiled',
       'SpeakingRate', 'NoFillers', 'Friendly', 'Paused', 'EngagingTone',
       'StructuredAnswers', 'Calm', 'NotStressed', 'Focused', 'Authentic',
       'NotAwkward', 'Total', 'Filler Words', 'I Count', 'We Count',
       'They Count', 'Verb Count', 'Adverb Count', 'Preposition Count',
       'Conjunction Count', 'Positive Emotion Words', 'Negative Emotion Words',
       'Anxiety Words', 'Anger Words', 'Sadness Words', 'Cognitive Words',
       'Inhibition Words', 'Perceptual Words', 'Work-Related Words',
       'Articles', 'Negations', 'Quantifiers', 'Tentative Language',
       'duration', 'energy', 'power', 'min_pitch', 'max_pitch', 'mean_pitch',
       'pitch_sd', 'pitch_abs', 'pitch_quant', 'pitchUvsVRatio', 'Time:8',
       'iDifference', 'diffPitchMaxMin', 'diffPitchMaxMean',
       'diffPitchMaxMode', 'intensityMin', 'intensityMax', 'intensityMean',
       'intensitySD', 'intensityQuant', 'diffIntMa

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline on the unscaled training features
model = LinearRegression()

# Fit on the training split
model.fit(X=X_train, y=y_train)


In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Score the held-out rows with the fitted baseline
y_pred = model.predict(X_test)

# Mean squared error and coefficient of determination on the test split
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')


Mean Squared Error: 11.965809574362817
R^2 Score: -23.25532127622812


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.pipeline import make_pipeline

# Scale features: the scaler is fitted on the training split only and then
# applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize and train the model
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred = model.predict(X_test_scaled)

# Calculate performance metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

# Cross-validation.
# The scaler is placed inside a pipeline and the *unscaled* training data is
# passed in, so each fold fits its own scaler on that fold's training part.
# Cross-validating on X_train_scaled would reuse scaling statistics computed
# from rows that later serve as validation data.
cv_pipeline = make_pipeline(StandardScaler(), LinearRegression())
cv_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
# The scorer returns negated MSE, so flip the sign for reporting
print(f'Cross-validated MSE: {-cv_scores.mean()}')


Mean Squared Error: 43.8048337568719
R^2 Score: -87.79468703071596
Cross-validated MSE: 0.1787672932324542


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Initialize and train the RandomForest model.
# random_state is fixed so the fitted forest, and therefore the reported
# metrics, are reproducible across runs. The scaled features are kept as
# input because the saved scaler is applied again at inference time.
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train_scaled, y_train)

# Predict and evaluate on the held-out split
y_pred_rf = rf_model.predict(X_test_scaled)
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print(f'Random Forest Mean Squared Error: {mse_rf}')
print(f'Random Forest R^2 Score: {r2_rf}')


Random Forest Mean Squared Error: 0.07557677153825769
Random Forest R^2 Score: 0.8468019348554484


In [None]:
# Show the random-forest predictions next to the true test targets for a
# side-by-side look (y_test keeps its original row index).
print(y_pred_rf)
print(y_test)

[3.75572597 5.10637257 4.96654198 4.95126741 4.41825861 5.80811498
 3.90308169 3.77526353 4.71704473 4.03868326 4.41796798 4.97090471
 5.11414636 5.46628648]
22    3.189661
0     5.106224
47    4.475493
4     4.831482
53    4.298151
18    6.015717
10    4.325522
33    3.560751
44    4.831596
12    3.963723
31    4.527523
9     4.925221
59    4.705135
5     5.473467
Name: RecommendHiring, dtype: float64


In [None]:
import joblib

# Save the trained Random Forest model together with the scaler it was
# trained with. Both files are loaded by the inference step, so both must be
# written here; otherwise a fresh run fails on the missing model file or
# picks up a stale one.
joblib.dump(rf_model, 'random_forest_model.pkl')
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']

In [None]:
import pandas as pd
import joblib

# Load the model and scaler.
# joblib files are pickles: only load files produced by this notebook or
# another trusted source.
model = joblib.load('random_forest_model.pkl')
scaler = joblib.load('scaler.pkl')  # Load the scaler if you used one

# Create a DataFrame with new data
# Ensure the column names match those used during training
X_new = pd.DataFrame({
    'Overall': [2],
    'Colleague': [1],
    'Engaged': [3],
    'Excited': [2],
    'EyeContact': [1],
    'Smiled': [0],
    'SpeakingRate': [1.2],
    'NoFillers': [3],
    'Friendly': [1],
    'Paused': [2],
    'EngagingTone': [0],
    'StructuredAnswers': [1],
    'Calm': [0],
    'NotStressed': [1],
    'Focused': [0],
    'Authentic': [1],
    'NotAwkward': [0],
    'Total': [30],
    'Filler Words': [5],
    'I Count': [2],
    'We Count': [1],
    'They Count': [0],
    'Verb Count': [10],
    'Adverb Count': [4],
    'Preposition Count': [6],
    'Conjunction Count': [3],
    'Positive Emotion Words': [7],
    'Negative Emotion Words': [2],
    'Anxiety Words': [1],
    'Anger Words': [0],
    'Sadness Words': [1],
    'Cognitive Words': [8],
    'Inhibition Words': [2],
    'Perceptual Words': [5],
    'Work-Related Words': [4],
    'Articles': [12],
    'Negations': [1],
    'Quantifiers': [3],
    'Tentative Language': [2],
    'duration': [120.0],
    'energy': [1.5],
    'power': [0.5],
    'min_pitch': [100.0],
    'max_pitch': [300.0],
    'mean_pitch': [200.0],
    'pitch_sd': [20.0],
    'pitch_abs': [50.0],
    'pitch_quant': [30.0],
    'pitchUvsVRatio': [0.6],
    'Time:8': [0.3],
    'iDifference': [0.1],
    'diffPitchMaxMin': [200.0],
    'diffPitchMaxMean': [100.0],
    'diffPitchMaxMode': [50.0],
    'intensityMin': [0.4],
    'intensityMax': [1.2],
    'intensityMean': [0.8],
    'intensitySD': [0.2],
    'intensityQuant': [0.5],
    'diffIntMaxMin': [0.8],
    'diffIntMaxMean': [0.4],
    'diffIntMaxMode': [0.3],
    'avgVal1': [0.5],
    'avgVal2': [0.6],
    'avgVal3': [0.7],
    'avgBand1': [0.4],
    'avgBand2': [0.5],
    'avgBand3': [0.6],
    'fmean1': [0.3],
    'fmean2': [0.4],
    'fmean3': [0.5],
    'f2meanf1': [0.2],
    'f3meanf1': [0.3],
    'f1STD': [0.1],
    'f2STD': [0.2],
    'f3STD': [0.3],
    'f2STDf1': [0.1],
    'f2STDf2': [0.2],
    'jitter': [0.05],
    'shimmer': [0.1],
    'jitterRap': [0.02],
    'meanPeriod': [0.3],
    'percentUnvoiced': [0.1],
    'numVoiceBreaks': [2],
    'PercentBreaks': [0.15],
    'speakRate': [150.0],
    'numPause': [3],
    'maxDurPause': [0.5],
    'avgDurPause': [0.3],
    'TotDurPause:3': [0.8],
    'iInterval': [0.1],
    'MaxRising:3': [0.2],
    'MaxFalling:3': [0.3],
    'AvgTotRis:3': [0.2],
    'AvgTotFall:3': [0.3],
    'numRising': [5],
    'numFall': [3]
})

# Align the new data with the columns the scaler was fitted on.
# When the scaler recorded its training column names, check that X_new has
# exactly those columns and put them in the fitted order, so the result does
# not depend on the order in which the dict above happens to be written.
expected_columns = getattr(scaler, 'feature_names_in_', None)
if expected_columns is not None:
    expected_columns = list(expected_columns)
    missing_columns = [name for name in expected_columns if name not in X_new.columns]
    unexpected_columns = [name for name in X_new.columns if name not in expected_columns]
    if missing_columns or unexpected_columns:
        raise ValueError(
            f'X_new does not match the training features; '
            f'missing: {missing_columns}, unexpected: {unexpected_columns}'
        )
    X_new = X_new[expected_columns]

# Preprocess the new data
X_new_scaled = scaler.transform(X_new)  # Apply scaling if needed

# Predict using the trained model
predictions = model.predict(X_new_scaled)

print(predictions)


[3.64978231]
