In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import numpy as np

# Read the comma-separated review export from the Colab Drive path.
df = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/reviews_sentimental_analysis.csv',
    delimiter=',',
)

# Missing pros/cons text becomes '' so the string concatenation below never
# yields NaN for a row.
for text_col in ('r_cons', 'r_pros'):
    df[text_col] = df[text_col].fillna('')

# One document per review: the cons text, a single space, then the pros text.
review_text = df['r_cons'] + ' ' + df['r_pros']

# Fit the TF-IDF vocabulary on every review and build the feature matrix.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(review_text)

# Fit a linear regression from the TF-IDF features to the company rating.
model = LinearRegression()
model.fit(X, df['r_company_rating'])



In [8]:
# Score every review with the fitted model and store the result on the frame.
# X is the TF-IDF matrix built from these same rows, so the scores are
# in-sample predictions.
predicted_scores = model.predict(X)
df['overall_score'] = predicted_scores

# Optional (disabled): clip the predicted scores to the 1-5 range.
# df['overall_score'] = np.clip(df['overall_score'], 1, 5)

# Average the predicted score per employer; reset_index() turns the employer
# name back into an ordinary column of df_mean.
scores_by_employer = df.groupby(['header_employer_name'])['overall_score']
df_mean = scores_by_employer.mean().reset_index()

# Optional (disabled): R^2 of the model on its own training data.
# y_pred_train = model.predict(X)
# r2 = r2_score(df['r_company_rating'], y_pred_train)
# print('Training R^2 score: {:.2f}'.format(r2))

# K-fold cross-validation of the TF-IDF + linear-regression pipeline.
# The rows are cut into k contiguous, unshuffled folds; each fold is used once
# as the validation set while the remaining folds form the training set.
# NOTE(review): if the CSV is ordered (e.g. grouped by employer), contiguous
# folds may be unrepresentative -- consider shuffling with a fixed seed.
k = 5

# Split row positions instead of the DataFrame itself: np.array_split on a
# DataFrame goes through the deprecated DataFrame.swapaxes. The fold
# boundaries are the same ones np.array_split(df, k) produces, and `folds`
# is still a list of DataFrames.
folds = [df.iloc[positions] for positions in np.array_split(np.arange(len(df)), k)]

# One R^2 value per fold.
scores = []
for i in range(k):
    # Fold i is held out; every other fold is concatenated for training.
    validation_data = folds[i]
    training_data = pd.concat([fold for j, fold in enumerate(folds) if j != i])

    # Use fold-local names here. Rebinding the notebook-level `vectorizer` and
    # `model` would replace the objects fitted on the full data set, so
    # re-running `model.predict(X)` afterwards would use a model whose feature
    # space does not match X.
    fold_vectorizer = TfidfVectorizer()
    X_train = fold_vectorizer.fit_transform(training_data['r_cons'] + ' ' + training_data['r_pros'])
    y_train = training_data['r_company_rating']
    fold_model = LinearRegression()
    fold_model.fit(X_train, y_train)

    # The vocabulary is learned from the training folds only; the held-out
    # fold is transformed with it, never fitted on.
    X_val = fold_vectorizer.transform(validation_data['r_cons'] + ' ' + validation_data['r_pros'])
    y_val = validation_data['r_company_rating']
    y_pred = fold_model.predict(X_val)
    score = r2_score(y_val, y_pred)
    scores.append(score)

# Average the per-fold R^2 values.
# NOTE(review): the recorded run printed a negative mean R^2, i.e. worse than
# always predicting the mean rating; an unregularized linear model on TF-IDF
# features may be overfitting -- consider a regularized model such as Ridge.
mean_score = np.mean(scores)
print('Mean R-squared:', mean_score)


# Bar chart of the mean predicted score per employer: employer names along the
# x-axis, one bar per row of df_mean.
fig, ax = plt.subplots()
ax.bar(df_mean['header_employer_name'], df_mean['overall_score'])
ax.set_xlabel('Employer Name')
ax.set_ylabel('Predicted Overall Employer Score')
plt.show()

Mean R-squared: -1.1207752991997757
