In [4]:
pip install pandas numpy matplotlib seaborn scikit-learn textblob transformers torch

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement pandas (from versions: none)
ERROR: No matching distribution found for pandas

[notice] A new release of pip is available: 23.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import pandas as pd
import os

# Make sure the output directory for the saved figures exists
os.makedirs("visuals", exist_ok=True)

# Read the raw message data into the working frame
df = pd.read_csv("test.csv")

# Quick sanity check: column names followed by the first few rows
column_names = df.columns.tolist()
preview = df.head()
print("Columns:", column_names)
print("\nSample data:")
print(preview)


In [None]:
# Parse the 'date' column; values that cannot be parsed become NaT
# (errors='coerce') instead of raising.
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Drop rows whose date could not be parsed. The explicit .copy() makes the
# filtered frame independent of the original, so the column assignments that
# follow are unambiguous writes to a new frame (and cannot trigger pandas'
# SettingWithCopyWarning on versions that track filtered results).
rows_before = len(df)
df = df.dropna(subset=['date']).copy()
rows_dropped = rows_before - len(df)

# Extract the calendar month (as a monthly Period) for grouping
df['month'] = df['date'].dt.to_period("M")

print("Time column converted and month extracted.")
# Surface the data loss instead of discarding rows silently
if rows_dropped:
    print(f"Dropped {rows_dropped} row(s) with missing or unparseable dates.")


In [None]:
from textblob import TextBlob
import matplotlib.pyplot as plt
import seaborn as sns

# Classify a message as Positive / Negative / Neutral from its TextBlob polarity
def classify_sentiment(text, threshold=0.1):
    """Label a piece of text by the sign and size of its TextBlob polarity.

    Parameters
    ----------
    text : any
        The message to score. It is passed through ``str()`` first, so
        non-string values (e.g. a missing value) are scored on their string
        representation rather than raising.
    threshold : float, default 0.1
        Width of the neutral band around zero. Polarity strictly above
        ``threshold`` is 'Positive', strictly below ``-threshold`` is
        'Negative', and anything in between (inclusive) is 'Neutral'.
        The default reproduces the original hard-coded cut-offs.

    Returns
    -------
    str
        One of 'Positive', 'Negative' or 'Neutral'.
    """
    polarity = TextBlob(str(text)).sentiment.polarity
    if polarity > threshold:
        return 'Positive'
    if polarity < -threshold:
        return 'Negative'
    return 'Neutral'

# Apply sentiment classification to every message body
df['sentiment'] = df['body'].apply(classify_sentiment)

# Visualization 1: Overall Sentiment Distribution
# Pin the bar order. Without an explicit order the bars follow whichever label
# happens to appear first in the data, so the chart layout (and the colour
# assigned to each label) would change from one dataset to the next.
sentiment_order = ['Negative', 'Neutral', 'Positive']

fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='sentiment', data=df, order=sentiment_order, palette='Set2', ax=ax)
ax.set_title("Sentiment Distribution", fontsize=16)
ax.set_xlabel("Sentiment")
ax.set_ylabel("Message Count")
fig.tight_layout()
fig.savefig("visuals/sentiment_distribution.png")
plt.show()


In [None]:
# Count messages per (month, sentiment) and pivot sentiments into columns.
# fill_value=0 fills month/sentiment combinations that never occurred while
# keeping the counts as integers; unstack() followed by fillna(0) would first
# introduce NaN and thereby turn every count into a float.
monthly_sentiment = (
    df.groupby(['month', 'sentiment'])
      .size()
      .unstack(fill_value=0)
)

# Visualization 2: Monthly Sentiment Trend (Stacked Bar)
ax = monthly_sentiment.plot(kind='bar', stacked=True, figsize=(14, 6), colormap='Paired')
ax.set_title('Monthly Sentiment Trends')
ax.set_xlabel('Month')
ax.set_ylabel('Message Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig("visuals/monthly_sentiment_trend.png")
plt.show()


In [None]:
# Numeric weight attached to each sentiment label
score_map = {'Positive': 1, 'Neutral': 0, 'Negative': -1}
df['score'] = df['sentiment'].map(score_map)

# Sum the message scores per sender and month, then give the columns
# report-friendly names
monthly_scores = (
    df.groupby(['from', 'month'])['score']
      .sum()
      .reset_index()
      .rename(columns={'from': 'employee', 'score': 'monthly_score'})
)

# Visualization: Distribution of Monthly Scores
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(monthly_scores['monthly_score'], bins=30, kde=True, color='skyblue', ax=ax)
ax.set_title("Distribution of Monthly Sentiment Scores by Employee", fontsize=16)
ax.set_xlabel("Monthly Sentiment Score")
ax.set_ylabel("Frequency")
fig.tight_layout()
fig.savefig("visuals/monthly_score_distribution.png")
plt.show()


In [None]:
# Maps 'YYYY-MM' -> {'Top 3 Positive': DataFrame, 'Top 3 Negative': DataFrame},
# each frame holding the columns 'employee' and 'monthly_score'.
top_employees = {}

# For each month, find top 3 positive and negative employees.
# groupby hands over each month's rows once instead of re-filtering the whole
# frame per month. 'employee' is used as a secondary sort key so that ties in
# monthly_score are always broken the same way (alphabetically) rather than
# depending on the sort algorithm's handling of equal values.
for month, month_data in monthly_scores.groupby('month'):
    top_pos = month_data.sort_values(
        by=['monthly_score', 'employee'], ascending=[False, True]
    ).head(3)
    top_neg = month_data.sort_values(
        by=['monthly_score', 'employee'], ascending=[True, True]
    ).head(3)

    top_employees[str(month)] = {
        'Top 3 Positive': top_pos[['employee', 'monthly_score']],
        'Top 3 Negative': top_neg[['employee', 'monthly_score']]
    }

# Display for example the latest month.
# Guarded: with no scores at all, max() would be NaT and the lookup would fail.
if top_employees:
    latest_month = str(monthly_scores['month'].max())
    print(f"\nTop Employees for {latest_month}:")

    print("\nTop 3 Positive Employees:")
    print(top_employees[latest_month]['Top 3 Positive'])

    print("\nTop 3 Negative Employees:")
    print(top_employees[latest_month]['Top 3 Negative'])
else:
    print("\nNo monthly scores available; nothing to rank.")


In [None]:
# Flight-risk rule: an employee is flagged when at least
# NEGATIVE_MSG_THRESHOLD negative messages fall within ROLLING_WINDOW.
NEGATIVE_MSG_THRESHOLD = 4
ROLLING_WINDOW = pd.Timedelta(days=30)

# Filter only negative messages (sender and timestamp are all that is needed)
negative_msgs = df.loc[df['sentiment'] == 'Negative', ['from', 'date']]

flight_risk = set()

# For each sender, sort the negative-message timestamps and look at the time
# span between every message and the one (threshold - 1) positions before it.
# If any such span fits in the window, that many messages occurred within it.
# The comparison is done on the full Timedelta: using `.days` would truncate
# the hours, so e.g. a span of 30 days 23 hours would wrongly count as 30.
for emp, emp_dates in negative_msgs.groupby('from')['date']:
    spans = emp_dates.sort_values().diff(NEGATIVE_MSG_THRESHOLD - 1)
    # The first (threshold - 1) entries are NaT and compare as False
    if (spans <= ROLLING_WINDOW).any():
        flight_risk.add(emp)

print("\n⚠️ Flight Risk Employees:")
print(flight_risk)


In [None]:
import matplotlib.dates as mdates

# Timeline of negative messages, one horizontal row per flagged employee.
# flight_risk is a set, whose iteration order for strings can differ between
# kernel sessions; sorting gives the rows (and the saved image) a stable order.
# key=str keeps the sort safe even if the identifiers are not all strings.
risk_order = sorted(flight_risk, key=str)

plt.figure(figsize=(12, 6))
for row, emp in enumerate(risk_order):
    emp_neg_dates = negative_msgs[negative_msgs['from'] == emp]['date']
    # Every point of this employee sits on the same y position (its row index)
    plt.scatter(emp_neg_dates, [row] * len(emp_neg_dates), label=emp, s=100, alpha=0.7)

# Label each row with the employee it belongs to
plt.yticks(range(len(risk_order)), risk_order)
plt.title("Timeline of Negative Messages for Flight-Risk Employees")
plt.xlabel("Date")
plt.ylabel("Employee")
# One tick per month, formatted as YYYY-MM
plt.gca().xaxis.set_major_locator(mdates.MonthLocator())
plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig("visuals/flight_risk_timeline.png")
plt.show()


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Human-readable month key in integer form YYYYMM (kept for the exported report)
monthly_scores['month_int'] = monthly_scores['month'].astype(str).str.replace('-', '').astype(int)

# Regression feature: number of months since the first observed month.
# YYYYMM itself is not evenly spaced in time (December -> January jumps by 89
# while every other step is 1), which would distort a linear fit, so the model
# is trained on a consecutive month index instead.
month_ordinal = monthly_scores['month'].dt.year * 12 + monthly_scores['month'].dt.month
monthly_scores['month_index'] = month_ordinal - month_ordinal.min()

X = monthly_scores[['month_index']]
y = monthly_scores['monthly_score']

# Train-test split (fixed seed so the split, and therefore the metrics, repeat)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Linear Regression Model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluation metrics
print(f"\nLinear Regression Results:\nMean Squared Error: {mean_squared_error(y_test, y_pred):.3f}")
print(f"R² Score: {r2_score(y_test, y_pred):.3f}")

# Visualization: Actual vs Predicted
x_test_values = X_test['month_index'].to_numpy()
# Draw the fitted line through x-sorted points so it is a single clean segment
line_order = x_test_values.argsort()

plt.figure(figsize=(10,6))
plt.scatter(x_test_values, y_test, color='blue', label='Actual')
plt.scatter(x_test_values, y_pred, color='red', label='Predicted')
plt.plot(x_test_values[line_order], y_pred[line_order], color='red', linewidth=2)
plt.title('Sentiment Score Prediction: Actual vs Predicted')
plt.xlabel('Month (months since first observed month)')
plt.ylabel('Monthly Sentiment Score')
plt.legend()
plt.tight_layout()
plt.savefig("visuals/sentiment_score_prediction.png")
plt.show()


In [None]:
def _excel_safe(frame):
    """Return a copy of ``frame`` whose Period data is rendered as strings.

    Period values (such as the monthly 'month' column/index used in this
    notebook) are not a cell type Excel writers accept, so they are converted
    to their string form (e.g. '2020-01') before export. All other columns are
    left untouched and the input frame is not modified.
    """
    safe = frame.copy()
    for col in safe.columns:
        if isinstance(safe[col].dtype, pd.PeriodDtype):
            safe[col] = safe[col].astype(str)
    # Render a Period index the same way so months look identical on every sheet
    if isinstance(safe.index, pd.PeriodIndex):
        safe.index = safe.index.astype(str)
    return safe


# Write every result table to one workbook, one sheet per table
with pd.ExcelWriter("Employee_Sentiment_Report.xlsx") as writer:
    _excel_safe(df).to_excel(writer, sheet_name='Raw_Data', index=False)
    # Index is kept here: it carries the month of each row
    _excel_safe(monthly_sentiment).to_excel(writer, sheet_name='Monthly_Sentiment')
    _excel_safe(monthly_scores).to_excel(writer, sheet_name='Monthly_Scores', index=False)

    # Save Top Employees in separate sheets (two sheets per month)
    for month, data in top_employees.items():
        _excel_safe(data['Top 3 Positive']).to_excel(writer, sheet_name=f'{month}_Top_Pos', index=False)
        _excel_safe(data['Top 3 Negative']).to_excel(writer, sheet_name=f'{month}_Top_Neg', index=False)

print("✅ Report exported to Employee_Sentiment_Report.xlsx")
