## EDA for Customer Experience
**Steps:**
- Load up the scraped dataset
- Preprocess and clean the data
- Visualize the results 

In [28]:
# import necessary libraries
import pandas as pd 
import numpy as np
import seaborn as sns
from Scripts.config import DATA_PATHS

### Load the Dataset

In [27]:
# Load the Dataset to get reviews for EDA
import sys
import os

df = pd.read_csv("../data/raw/raw_reviews.csv")

In [8]:
df

Unnamed: 0,review_text,rating,date,bank,app_id,source,review_id
0,CBE ይለያል።,5,2025-11-29,CBE,,google_play,cb37b096-e071-4f0f-a8fd-067b7d71706d
1,it's special for me,5,2025-11-29,CBE,,google_play,70f504ff-daed-40d9-9c89-cc49a95ef659
2,Make it user friendly.,2,2025-11-29,CBE,,google_play,28f229b5-0026-41b9-a1eb-b76e74736f63
3,maaliif daddafee install gaafata,3,2025-11-28,CBE,,google_play,68d8daea-db47-4e23-a692-755173dea983
4,good app,5,2025-11-28,CBE,,google_play,ee0dbb0e-4eb0-47b5-9874-c37877493f99
...,...,...,...,...,...,...,...
1195,good,5,2025-05-09,DASHEN,,google_play,b3c8405c-96a7-4b5e-884c-76c97c530c34
1196,Amazing app super easy to use and best design....,5,2025-05-09,DASHEN,,google_play,e1c1214a-8bc1-45db-bc49-3d51dddc6b88
1197,its the best ever,5,2025-05-09,DASHEN,,google_play,8d1d472b-2bae-4749-b089-5632108ade02
1198,nice,5,2025-05-08,DASHEN,,google_play,749851ed-72d7-4f1c-8e5e-27dd822b5008


In [None]:
# Drop rows with no review text and no rating
df = df.dropna(subset=["review_text"], how="all")
df["review_text"] = df["review_text"].astype(str).str.strip()



In [10]:
df

Unnamed: 0,review_text,rating,date,bank,app_id,source,review_id
0,CBE ይለያል።,5,2025-11-29,CBE,,google_play,cb37b096-e071-4f0f-a8fd-067b7d71706d
1,it's special for me,5,2025-11-29,CBE,,google_play,70f504ff-daed-40d9-9c89-cc49a95ef659
2,Make it user friendly.,2,2025-11-29,CBE,,google_play,28f229b5-0026-41b9-a1eb-b76e74736f63
3,maaliif daddafee install gaafata,3,2025-11-28,CBE,,google_play,68d8daea-db47-4e23-a692-755173dea983
4,good app,5,2025-11-28,CBE,,google_play,ee0dbb0e-4eb0-47b5-9874-c37877493f99
...,...,...,...,...,...,...,...
1195,good,5,2025-05-09,DASHEN,,google_play,b3c8405c-96a7-4b5e-884c-76c97c530c34
1196,Amazing app super easy to use and best design....,5,2025-05-09,DASHEN,,google_play,e1c1214a-8bc1-45db-bc49-3d51dddc6b88
1197,its the best ever,5,2025-05-09,DASHEN,,google_play,8d1d472b-2bae-4749-b089-5632108ade02
1198,nice,5,2025-05-08,DASHEN,,google_play,749851ed-72d7-4f1c-8e5e-27dd822b5008


In [12]:
# Normalize Dates
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df['date'] = df['date'].dt.date

In [13]:
df.head()

Unnamed: 0,review_text,rating,date,bank,app_id,source,review_id
0,CBE ይለያል።,5,2025-11-29,CBE,,google_play,cb37b096-e071-4f0f-a8fd-067b7d71706d
1,it's special for me,5,2025-11-29,CBE,,google_play,70f504ff-daed-40d9-9c89-cc49a95ef659
2,Make it user friendly.,2,2025-11-29,CBE,,google_play,28f229b5-0026-41b9-a1eb-b76e74736f63
3,maaliif daddafee install gaafata,3,2025-11-28,CBE,,google_play,68d8daea-db47-4e23-a692-755173dea983
4,good app,5,2025-11-28,CBE,,google_play,ee0dbb0e-4eb0-47b5-9874-c37877493f99


In [15]:
# Remove duplicates by review_id or by text
if "review_id" in df.columns:
    df = df.drop_duplicates(subset=["review_id"], keep="first")
df = df.drop_duplicates(subset=["review_text"], keep="first")

df


Unnamed: 0,review_text,rating,date,bank,app_id,source,review_id
0,CBE ይለያል።,5,2025-11-29,CBE,,google_play,cb37b096-e071-4f0f-a8fd-067b7d71706d
1,it's special for me,5,2025-11-29,CBE,,google_play,70f504ff-daed-40d9-9c89-cc49a95ef659
2,Make it user friendly.,2,2025-11-29,CBE,,google_play,28f229b5-0026-41b9-a1eb-b76e74736f63
3,maaliif daddafee install gaafata,3,2025-11-28,CBE,,google_play,68d8daea-db47-4e23-a692-755173dea983
4,good app,5,2025-11-28,CBE,,google_play,ee0dbb0e-4eb0-47b5-9874-c37877493f99
...,...,...,...,...,...,...,...
1192,"To be honest, best banking and lifestyle app i...",5,2025-05-12,DASHEN,,google_play,d61a5b09-39d5-4502-b270-15c6ad04f19c
1194,"A must have, seamless, all in one digital plat...",5,2025-05-12,DASHEN,,google_play,c3afc51f-9e2b-4b3d-b155-0339a9ce0f20
1196,Amazing app super easy to use and best design....,5,2025-05-09,DASHEN,,google_play,e1c1214a-8bc1-45db-bc49-3d51dddc6b88
1197,its the best ever,5,2025-05-09,DASHEN,,google_play,8d1d472b-2bae-4749-b089-5632108ade02


In [16]:
# Fill missing bank values as Unknown
df["bank"] = df.get("bank", pd.Series([None]*len(df))).fillna("Unknown")


In [17]:
df

Unnamed: 0,review_text,rating,date,bank,app_id,source,review_id
0,CBE ይለያል።,5,2025-11-29,CBE,,google_play,cb37b096-e071-4f0f-a8fd-067b7d71706d
1,it's special for me,5,2025-11-29,CBE,,google_play,70f504ff-daed-40d9-9c89-cc49a95ef659
2,Make it user friendly.,2,2025-11-29,CBE,,google_play,28f229b5-0026-41b9-a1eb-b76e74736f63
3,maaliif daddafee install gaafata,3,2025-11-28,CBE,,google_play,68d8daea-db47-4e23-a692-755173dea983
4,good app,5,2025-11-28,CBE,,google_play,ee0dbb0e-4eb0-47b5-9874-c37877493f99
...,...,...,...,...,...,...,...
1192,"To be honest, best banking and lifestyle app i...",5,2025-05-12,DASHEN,,google_play,d61a5b09-39d5-4502-b270-15c6ad04f19c
1194,"A must have, seamless, all in one digital plat...",5,2025-05-12,DASHEN,,google_play,c3afc51f-9e2b-4b3d-b155-0339a9ce0f20
1196,Amazing app super easy to use and best design....,5,2025-05-09,DASHEN,,google_play,e1c1214a-8bc1-45db-bc49-3d51dddc6b88
1197,its the best ever,5,2025-05-09,DASHEN,,google_play,8d1d472b-2bae-4749-b089-5632108ade02


In [18]:
 # Basic quality check
total = len(df)
missing = df["review_text"].isna().sum()
print(f"Total rows after cleaning: {total}, missing texts: {missing}")
df

Total rows after cleaning: 968, missing texts: 0


Unnamed: 0,review_text,rating,date,bank,app_id,source,review_id
0,CBE ይለያል።,5,2025-11-29,CBE,,google_play,cb37b096-e071-4f0f-a8fd-067b7d71706d
1,it's special for me,5,2025-11-29,CBE,,google_play,70f504ff-daed-40d9-9c89-cc49a95ef659
2,Make it user friendly.,2,2025-11-29,CBE,,google_play,28f229b5-0026-41b9-a1eb-b76e74736f63
3,maaliif daddafee install gaafata,3,2025-11-28,CBE,,google_play,68d8daea-db47-4e23-a692-755173dea983
4,good app,5,2025-11-28,CBE,,google_play,ee0dbb0e-4eb0-47b5-9874-c37877493f99
...,...,...,...,...,...,...,...
1192,"To be honest, best banking and lifestyle app i...",5,2025-05-12,DASHEN,,google_play,d61a5b09-39d5-4502-b270-15c6ad04f19c
1194,"A must have, seamless, all in one digital plat...",5,2025-05-12,DASHEN,,google_play,c3afc51f-9e2b-4b3d-b155-0339a9ce0f20
1196,Amazing app super easy to use and best design....,5,2025-05-09,DASHEN,,google_play,e1c1214a-8bc1-45db-bc49-3d51dddc6b88
1197,its the best ever,5,2025-05-09,DASHEN,,google_play,8d1d472b-2bae-4749-b089-5632108ade02


In [19]:
# drop app_id and review_id columns
df = df.drop(columns=["app_id", "review_id"], errors="ignore")

In [20]:
df

Unnamed: 0,review_text,rating,date,bank,source
0,CBE ይለያል።,5,2025-11-29,CBE,google_play
1,it's special for me,5,2025-11-29,CBE,google_play
2,Make it user friendly.,2,2025-11-29,CBE,google_play
3,maaliif daddafee install gaafata,3,2025-11-28,CBE,google_play
4,good app,5,2025-11-28,CBE,google_play
...,...,...,...,...,...
1192,"To be honest, best banking and lifestyle app i...",5,2025-05-12,DASHEN,google_play
1194,"A must have, seamless, all in one digital plat...",5,2025-05-12,DASHEN,google_play
1196,Amazing app super easy to use and best design....,5,2025-05-09,DASHEN,google_play
1197,its the best ever,5,2025-05-09,DASHEN,google_play


In [26]:
# output cleaned data
df.to_csv(f"../{DATA_PATHS['processed_reviews']}", index=False)