In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the reviews CSV into a DataFrame
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/reviews_sentimental_analysis.csv', sep=',')

# Missing pros/cons text becomes an empty string
for text_col in ('r_cons', 'r_pros'):
    df[text_col] = df[text_col].fillna('')

# Give every distinct employer name an integer ID (IDs follow sorted name order)
# and keep the reverse lookup for labelling plots
companies = sorted(set(df['header_employer_name']))
company_to_id = {name: idx for idx, name in enumerate(companies)}
id_to_company = dict(enumerate(companies))
df['company_id'] = df['header_employer_name'].map(company_to_id)

def tokenize(text):
    """Split ``text`` on runs of whitespace and return the list of tokens.

    No lowercasing or punctuation stripping is applied, so ``"Good"`` and
    ``"good,"`` are distinct tokens. An empty string yields an empty list.
    """
    return text.split()


def tfidf_matrix(texts):
    """Build a dense TF-IDF matrix for an iterable of text documents.

    Each entry is ``tf * idf`` where ``tf`` is the raw count of the word in
    the document and ``idf = ln(num_docs / document_frequency)``. A word that
    occurs in every document therefore gets weight 0.

    Parameters
    ----------
    texts : iterable of str
        The documents (list, pandas Series, generator, ...). It is consumed
        exactly once.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(num_docs, vocabulary_size)``. Columns are ordered by
        the first appearance of each word across the documents, so the layout
        is reproducible between runs (iterating a ``set`` of strings is not).
    """
    # Tokenize and count every document exactly once; the per-document counts
    # are reused for both the document frequencies and the final fill.
    doc_term_counts = []
    for text in texts:
        counts = {}
        for word in tokenize(text):
            counts[word] = counts.get(word, 0) + 1
        doc_term_counts.append(counts)

    # Document frequency of each word. Dicts preserve insertion order, which
    # fixes the vocabulary (column) order to first appearance.
    doc_freq = {}
    for counts in doc_term_counts:
        for word in counts:
            doc_freq[word] = doc_freq.get(word, 0) + 1

    # Inverse document frequency and the column index of each word
    num_docs = len(doc_term_counts)
    word_idf = {word: np.log(num_docs / freq) for word, freq in doc_freq.items()}
    column_of = {word: j for j, word in enumerate(doc_freq)}

    # Fill only the cells for words that actually occur in each document
    # instead of scanning the whole vocabulary per document.
    matrix = np.zeros((num_docs, len(doc_freq)))
    for i, counts in enumerate(doc_term_counts):
        for word, count in counts.items():
            matrix[i, column_of[word]] = count * word_idf[word]
    return matrix


# Compute the TF-IDF matrix for the pros and cons
X = tfidf_matrix(df['r_cons'] + ' ' + df['r_pros'])

# Fit a linear regression (ordinary least squares, no intercept) that predicts
# the company rating from the TF-IDF features, using the first N_TRAIN reviews.
N_TRAIN = 5000
X_train = X[:N_TRAIN, :]
y_train = df['r_company_rating'][:N_TRAIN]
# Solve the least-squares problem directly instead of inverting X^T X:
# a TF-IDF matrix is usually rank-deficient (e.g. any word present in every
# document yields an all-zero column), and np.linalg.inv frequently returns
# meaningless values for such matrices without raising LinAlgError. lstsq
# gives the same weights when the solution is unique and the minimum-norm
# solution otherwise.
# NOTE(review): assumes the ratings in the training slice are numeric and
# non-missing — confirm against the dataset.
w, *_ = np.linalg.lstsq(X_train, y_train.to_numpy(dtype=float), rcond=None)

# Predict the overall employer score based on the pros and cons
df['overall_score'] = X.dot(w)

# Group the data by employer ID and calculate the mean overall score for each employer
df_mean = df.groupby(['company_id'])['overall_score'].mean().reset_index()

# Create a bar plot with employer names on the x-axis and predicted overall employer scores on the y-axis
plt.bar([id_to_company[i] for i in df_mean['company_id']], df_mean['overall_score'])
plt.xlabel('Employer Name')
plt.ylabel('Predicted Overall Employer Score')
plt.show()




In [None]:
# Employers to compare in this plot
employer_names = ['Atlassian', 'Google', 'Intel', 'Siemplify', 'Forter']

# Keep only the reviews whose employer is in the selection
is_selected = df['header_employer_name'].isin(employer_names)
df_filtered = df[is_selected]

# Average the predicted score per employer, then turn the result back into
# a two-column DataFrame (employer name, mean score)
mean_by_employer = df_filtered.groupby('header_employer_name')['overall_score'].mean()
df_mean = mean_by_employer.reset_index()

# Bar chart: one bar per selected employer, height = mean predicted score
plt.bar(df_mean['header_employer_name'], df_mean['overall_score'])
plt.xlabel('Employer Name')
plt.ylabel('Predicted Overall Employer Score')
plt.show()