# Installing required packages

In [None]:
!pip install scipy
!pip install fastparquet

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
# import seaborn as sns
import scipy.stats as st
import numpy as np
import re
from collections import Counter

import os

### Before trying to load the data, figure out where you are within the directories/folders

In [22]:
pwd

'/home/jovyan'

### Use the above cell to find out which folder you are in (hint: pwd or os.getcwd() --get Current Working Directory)

### 1. Now load the dataset

In [None]:
# Location of the parquet file; replace the placeholder with the real path
# (a full path may be needed, depending on the current working directory).
parquet_path = '/load/data/here/full/path/may/be/needed'
df = pd.read_parquet(parquet_path, engine='fastparquet')

In [None]:
# Quick structural overview of the loaded DataFrame: first rows, column
# names, dtypes / non-null counts, and the number of missing values per column.
print(df.head(), "\n")
print(df.columns)
# DataFrame.info() writes its report itself and returns None, so it is
# called bare; wrapping it in print() would emit a stray "None" line.
df.info()
print(df.isnull().sum())

### 2. Clean the data as appropriate

In [None]:
# Numeric columns whose missing values are replaced by the column median.
numeric_cols = [
    "ProcessingTimeHours",
    "ErrorsDetected",
    "ClientSatisfaction",
    "MonthlyRevenue",
]

# The originals are left untouched; each imputed copy gets a "_clean" suffix.
for column_name in numeric_cols:
    column_median = df[column_name].median()
    df[f"{column_name}_clean"] = df[column_name].fillna(column_median)

# Missing teams take the first value returned by mode(); missing comments
# receive a fixed placeholder string.
most_common_team = df["Team"].mode()[0]
df["Team_clean"] = df["Team"].fillna(most_common_team)
df["ClientComment_clean"] = df["ClientComment"].fillna("No comment provided")

print("Cleaning complete. Columns now include _clean versions.\n")

### 3. Summary Statistics for key variables

In [None]:
# describe() table (count, mean, std, quartiles, min/max) for the three
# cleaned key variables.
key_columns = [
    "ProcessingTimeHours_clean",
    "MonthlyRevenue_clean",
    "ClientSatisfaction_clean",
]
summary_stats = df[key_columns].describe()

In [None]:
# Display the describe() table built from the cleaned key columns; the extra
# "\n" argument adds spacing after the table.
print(summary_stats, "\n")

### 4. Group Comparisons

In [None]:
# Average cleaned processing time for each value of AutomationUsed.
print("Mean processing time by automation status:\n")
time_by_automation = df.groupby("AutomationUsed")["ProcessingTimeHours_clean"]
mean_time_by_automation = time_by_automation.mean()
print(mean_time_by_automation, "\n")



In [None]:
# Mean, standard deviation and row count of cleaned satisfaction per team.
print("Satisfaction by team:\n")
satisfaction_by_team = df.groupby("Team_clean")["ClientSatisfaction_clean"]
team_summary = satisfaction_by_team.agg(["mean", "std", "count"])
print(team_summary, "\n")

### 5A. Visualisation: Histogram + KDE Overlay

In [None]:
plt.figure(figsize=(8,5))

# Cleaned processing times; dropna() guards the histogram and the KDE
# against any values that are still missing.
data = df["ProcessingTimeHours_clean"].dropna()

# Histogram (density=True so it shares the y-scale with the KDE curve)
plt.hist(data, bins=30, density=True, color="lightsteelblue", edgecolor="black", alpha=0.7)

# KDE overlay, evaluated on 300 evenly spaced points across the observed range.
# Series.min()/max() are vectorised; the builtin min()/max() would iterate
# over the Series one element at a time in the interpreter.
kde = st.gaussian_kde(data)
x_vals = np.linspace(data.min(), data.max(), 300)
plt.plot(x_vals, kde(x_vals), color="darkblue", linewidth=2, label="Density Curve")

plt.title("Distribution of Processing Time (with KDE Overlay)")
plt.xlabel("Processing Time (hours)")
plt.ylabel("Density")
plt.legend()
plt.show()

### 5B. Revenue by Team – (with outliers)

In [None]:
plt.figure(figsize=(10,6))

# One revenue sample per team, in the order returned by unique().
teams = df["Team_clean"].unique()
raw_data = [df[df["Team_clean"] == t]["MonthlyRevenue_clean"].dropna() for t in teams]

# Boxes are drawn at x = 1..len(teams) by default. The team names are attached
# through xticks() below instead of boxplot(labels=...), because that keyword
# is deprecated in recent Matplotlib releases (renamed to tick_labels), while
# xticks() behaves the same on old and new versions.
plt.boxplot(raw_data, patch_artist=True)
plt.title("Monthly Revenue by Team (WITH Outliers)")
plt.xlabel("Team")
plt.ylabel("Monthly Revenue (£)")
plt.xticks(range(1, len(teams) + 1), teams, rotation=45)
plt.grid(axis="y", linestyle="--", alpha=0.6)
plt.show()

## Quiz: is the above chart meaningful? What can be done to improve it?

### 5C. Revenue by Team – CLEANED (outliers removed + mean + median)

In [None]:
def remove_outliers(series, factor=1.5):
    """Return the values of *series* that lie within the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1
    and Q3 are the 25th and 75th percentiles of *series* and IQR = Q3 - Q1.
    Values exactly on a fence are kept.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to filter.
    factor : float, optional
        Multiplier applied to the IQR when building the fences. The default
        of 1.5 reproduces the conventional box-plot whisker rule.

    Returns
    -------
    pandas.Series
        The subset of *series* inside the fences, with its original index.
    """
    Q1 = series.quantile(0.25)
    Q3 = series.quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - factor * IQR
    upper = Q3 + factor * IQR
    return series[(series >= lower) & (series <= upper)]

# Per-team revenue samples with IQR outliers stripped, in the same order as `teams`.
cleaned_data = []
for team_name in teams:
    team_revenue = df.loc[df["Team_clean"] == team_name, "MonthlyRevenue_clean"].dropna()
    cleaned_data.append(remove_outliers(team_revenue))

In [None]:
plt.figure(figsize=(10,6))

# Team names are attached through xticks() further down rather than
# boxplot(labels=...): that keyword is deprecated in recent Matplotlib
# releases (renamed to tick_labels), while xticks() works on all versions.
box = plt.boxplot(
    cleaned_data,
    notch=True,                 # shows median notch
    patch_artist=True,          # boxes become patches so they can be filled
    showmeans=True,
    meanprops={"marker": "o", "markerfacecolor": "red", "markeredgecolor": "black"},
)

# Colour the boxes; the palette is cycled so every box gets a fill colour
# even when there are more teams than palette entries.
colors = ["lightblue", "lightgreen", "lightyellow", "lightpink", "lightgrey"]
for position, patch in enumerate(box["boxes"]):
    patch.set_facecolor(colors[position % len(colors)])

plt.title("Monthly Revenue by Team (Outliers Removed, Mean + Median Shown)")
plt.xlabel("Team")
plt.ylabel("Monthly Revenue (£)")
# Boxes sit at x = 1..len(teams) by default.
plt.xticks(range(1, len(teams) + 1), teams, rotation=45)
plt.grid(axis="y", linestyle="--", alpha=0.6)   # gridlines for comparison
plt.show()

### 5D. Bar chart of satisfaction by automation

In [None]:
plt.figure(figsize=(6,4))
means = df.groupby("AutomationUsed")["ClientSatisfaction_clean"].mean()

# Label and colour each bar from its actual group key rather than by position,
# so a missing or extra group can neither mislabel a bar nor cause a length
# mismatch. Keys 0/1 follow the coding used elsewhere in this notebook
# (AutomationUsed == 0 / == 1); any other key falls back to its string form.
status_styles = {0: ("No Automation", "grey"), 1: ("Automation", "steelblue")}
bar_labels = []
bar_colors = []
for status in means.index:
    label, colour = status_styles.get(status, (str(status), "lightgrey"))
    bar_labels.append(label)
    bar_colors.append(colour)

plt.bar(bar_labels, means, color=bar_colors)
plt.title("Satisfaction by Automation Status")
plt.ylabel("Average Satisfaction")
plt.grid(axis="y", linestyle="--", alpha=0.6)
plt.show()

### 5E. Scatterplot of satisfaction vs revenue

In [None]:
# Scatter of cleaned satisfaction (x) against cleaned monthly revenue (y),
# drawn through the object-oriented Matplotlib interface.
fig, ax = plt.subplots(figsize=(6, 4))
satisfaction = df["ClientSatisfaction_clean"]
revenue = df["MonthlyRevenue_clean"]
ax.scatter(satisfaction, revenue, alpha=0.6)
ax.set_title("Satisfaction vs Revenue")
ax.set_xlabel("Client Satisfaction")
ax.set_ylabel("Monthly Revenue (£)")
ax.grid(axis="both", linestyle="--", alpha=0.4)
plt.show()

### 6. Confidence Interval (95%) for Mean Processing Time

In [None]:
# t-based 95% confidence interval for the mean of the cleaned processing times.
# NOTE(review): the cleaned column contains median-imputed values, which
# presumably narrows the interval relative to the raw data - confirm intent.
x = df["ProcessingTimeHours_clean"].dropna()
mean, sem = x.mean(), st.sem(x)
degrees_of_freedom = len(x) - 1
ci = st.t.interval(0.95, df=degrees_of_freedom, loc=mean, scale=sem)
lower_bound, upper_bound = ci[0], ci[1]

print("95% Confidence Interval for Mean Processing Time:")
print(f"Mean = {mean:.2f} hours")
print(f"95% CI = ({lower_bound:.2f}, {upper_bound:.2f})\n")

### 7. Optional: Confidence Intervals by Group

In [None]:
# Split the cleaned processing times by automation flag (1 = automated, 0 = not).
is_automated = df["AutomationUsed"] == 1
is_not_automated = df["AutomationUsed"] == 0
auto = df.loc[is_automated, "ProcessingTimeHours_clean"].dropna()
non_auto = df.loc[is_not_automated, "ProcessingTimeHours_clean"].dropna()

def ci_95(series):
    """Return the 95% t-based confidence interval for the mean of *series*.

    The interval is centred on the sample mean, scaled by the standard error
    of the mean, and uses ``len(series) - 1`` degrees of freedom. The result
    is the (lower, upper) pair produced by ``scipy.stats.t.interval``.
    """
    sample_size = len(series)
    centre = series.mean()
    standard_error = st.sem(series)
    return st.t.interval(0.95, df=sample_size - 1, loc=centre, scale=standard_error)

# Report the interval for each group: heading first, then the (lower, upper) pair.
group_reports = (
    ("95% CI for automated engagements:", auto),
    ("95% CI for non-automated engagements:", non_auto),
)
for heading, sample in group_reports:
    print(heading)
    print(ci_95(sample), "\n")

### 8. Simple Text Analysis (Customer Comments)

In [None]:
# Lowercase + remove punctuation (kept for every row so it stays aligned with df)
clean_comments = df["ClientComment_clean"].str.lower().str.replace(r"[^\w\s]", "", regex=True)

# Rows whose original comment was missing only hold the imputed placeholder
# text; tokenising them would inflate the word counts with "no", "comment"
# and "provided", so they are skipped here.
has_comment = df["ClientComment"].notna()

# Remove common stopwords (minimal list to avoid extra packages)
stopwords = {"the", "and", "to", "with", "a", "of", "for", "in", "on", "is", "it", "was", "very"}

# Tokenise on whitespace and drop stopwords in a single pass
tokens = [
    word
    for comment in clean_comments[has_comment]
    for word in comment.split()
    if word not in stopwords
]

In [None]:
# Tally every remaining token and show the ten most frequent ones.
word_counts = Counter(tokens)
top_words = word_counts.most_common(10)
print("\nMost common words in client comments:")
print(top_words, "\n")


In [None]:

# Simple sentiment-style scoring (positive/negative word lists)
positive_words = {"satisfied", "happy", "fast", "good", "excellent"}
negative_words = {"delay", "slow", "error", "problem", "issue"}


def _sentiment_score(text):
    """Return (# positive words found) - (# negative words found) in *text*.

    Each list word counts at most once and is matched as a substring of the
    comment, so e.g. "delay" also matches "delays".
    """
    positive_hits = sum(1 for word in positive_words if word in text)
    negative_hits = sum(1 for word in negative_words if word in text)
    return positive_hits - negative_hits


df["CommentSentiment"] = clean_comments.apply(_sentiment_score)

print("Sentiment score examples:")
sentiment_preview = df[["ClientComment_clean", "CommentSentiment"]].head()
print(sentiment_preview, "\n")