## Data Cleaning

#### Importing Libraries

In [1]:
import sys
sys.path.append('../')
import pandas as pd
from src.utils import check_missing_data
from src.descriptive_statistics import describe_numerical, summarize_categorical, summarize_time_series
from src.plots import display_summary_table, plot_bar_charts,plot_bar_chart, box_plots

#### Loading Data

In [2]:
# Load the raw message dataset; the path is relative to the notebook's working directory
df_data = pd.read_csv('../data/msgs_dataset.csv')

In [3]:
# Preview the first rows to sanity-check that the file loaded as expected
df_data.head()

Unnamed: 0,signature,channel_id,channel_name,msg_id,message,cleaned_message,date,msg_link,msg_from_peer,msg_from_id,...,contact_name,contact_userid,geo_type,lat,lng,venue_id,venue_type,venue_title,venue_address,venue_provider
0,msg_iteration.0.user.DoctorsET.post.864,1102021238,DoctorsET,864,https://youtu.be/5DBoEm-8kmA?si=LDLuEecNfULJVD...,https://youtu.be/5DBoEm-8kmA?si=LDLuEecNfULJVD...,2023-12-18 17:04:02+00:00,https://t.me/DoctorsET/864,,,...,,,,,,,,,,
1,msg_iteration.1.user.DoctorsET.post.863,1102021238,DoctorsET,863,ዶክተርስ ኢትዮጵያ በ አዲስ አቀራረብ በ ቴሌቪዥን ፕሮግራሙን ለመጀመር ከ...,ዶክተርስ ኢትዮጵያ በ አዲስ አቀራረብ በ ቴሌቪዥን ፕሮግራሙን ለመጀመር ከ...,2023-11-03 16:14:39+00:00,https://t.me/DoctorsET/863,,,...,,,,,,,,,,
2,msg_iteration.2.user.DoctorsET.post.862,1102021238,DoctorsET,862,ሞት በስኳር \n\nለልጆቻችን የምናሲዘው ምሳቃ ሳናቀው እድሚያቸውን ይቀን...,ሞት በስኳር ለልጆቻችን የምናሲዘው ምሳቃ ሳናቀው እድሚያቸውን ይቀንሰው ይ...,2023-10-02 16:37:39+00:00,https://t.me/DoctorsET/862,,,...,,,,,,,,,,
3,msg_iteration.3.user.DoctorsET.post.861,1102021238,DoctorsET,861,ከ HIV የተፈወሰ ሰው አጋጥሟችሁ ያቃል ? ፈውስ እና ህክምና ?\n\nሙ...,ከ HIV የተፈወሰ ሰው አጋጥሟችሁ ያቃል ? ፈውስ እና ህክምና ? ሙሉ ቪ...,2023-09-16 07:54:32+00:00,https://t.me/DoctorsET/861,,,...,,,,,,,,,,
4,msg_iteration.4.user.DoctorsET.post.860,1102021238,DoctorsET,860,በቅርብ ጊዜ በሃገራችን ላይ እየተስተዋለ ያለ የተመሳሳይ ፆታ ( Homos...,በቅርብ ጊዜ በሃገራችን ላይ እየተስተዋለ ያለ የተመሳሳይ ፆታ ( Homos...,2023-09-01 16:16:15+00:00,https://t.me/DoctorsET/860,,,...,,,,,,,,,,


#### Data Quality Assessment

In [4]:
# Report missing values per column on the raw frame (helper from src.utils;
# judging by its output it lists the count and percentage missing — confirm in the helper)
check_missing_data(df_data)

Unnamed: 0,Column Name,Missing Values,Percentage Missing
4,message,1171,15.617498
5,cleaned_message,1171,15.617498
8,msg_from_peer,7498,100.0
9,msg_from_id,7498,100.0
14,forward_msg_from_peer_type,7150,95.358762
15,forward_msg_from_peer_id,7150,95.358762
16,forward_msg_from_peer_name,7182,95.785543
17,forward_msg_date,7085,94.491864
18,forward_msg_date_string,7085,94.491864
19,forward_msg_link,7182,95.785543


#### Data Cleaning

In [5]:
def drop_missing_data(df, threshold=90):
    """
    Remove every column whose share of missing values exceeds a threshold.

    Parameters:
    - df: Pandas DataFrame to filter; it is not modified.
    - threshold: float, percentage of missing values. Columns whose missing
      percentage is strictly greater than this value are removed.

    Returns:
    - DataFrame: a new DataFrame without the removed columns
    """
    row_count = len(df)
    null_pct = (df.isnull().sum() / row_count) * 100

    # Strict comparison: a column sitting exactly at the threshold is kept
    too_sparse = null_pct > threshold
    sparse_columns = null_pct.loc[too_sparse].index

    return df.drop(columns=sparse_columns)

# Remove columns that are more than 90% missing; df_data is left unchanged
# and the result is bound to a new name
df_cleaned = drop_missing_data(df_data, threshold=90)

In [6]:
# Preview the reduced frame to see which columns remain
df_cleaned.head()

Unnamed: 0,signature,channel_id,channel_name,msg_id,message,cleaned_message,date,msg_link,views,number_replies,number_forwards,is_forward,is_reply,contains_media,media_type,has_url
0,msg_iteration.0.user.DoctorsET.post.864,1102021238,DoctorsET,864,https://youtu.be/5DBoEm-8kmA?si=LDLuEecNfULJVD...,https://youtu.be/5DBoEm-8kmA?si=LDLuEecNfULJVD...,2023-12-18 17:04:02+00:00,https://t.me/DoctorsET/864,5711,0,3,0,0,1,MessageMediaWebPage,1
1,msg_iteration.1.user.DoctorsET.post.863,1102021238,DoctorsET,863,ዶክተርስ ኢትዮጵያ በ አዲስ አቀራረብ በ ቴሌቪዥን ፕሮግራሙን ለመጀመር ከ...,ዶክተርስ ኢትዮጵያ በ አዲስ አቀራረብ በ ቴሌቪዥን ፕሮግራሙን ለመጀመር ከ...,2023-11-03 16:14:39+00:00,https://t.me/DoctorsET/863,8404,0,5,0,0,1,MessageMediaWebPage,1
2,msg_iteration.2.user.DoctorsET.post.862,1102021238,DoctorsET,862,ሞት በስኳር \n\nለልጆቻችን የምናሲዘው ምሳቃ ሳናቀው እድሚያቸውን ይቀን...,ሞት በስኳር ለልጆቻችን የምናሲዘው ምሳቃ ሳናቀው እድሚያቸውን ይቀንሰው ይ...,2023-10-02 16:37:39+00:00,https://t.me/DoctorsET/862,10862,0,56,0,0,1,MessageMediaWebPage,1
3,msg_iteration.3.user.DoctorsET.post.861,1102021238,DoctorsET,861,ከ HIV የተፈወሰ ሰው አጋጥሟችሁ ያቃል ? ፈውስ እና ህክምና ?\n\nሙ...,ከ HIV የተፈወሰ ሰው አጋጥሟችሁ ያቃል ? ፈውስ እና ህክምና ? ሙሉ ቪ...,2023-09-16 07:54:32+00:00,https://t.me/DoctorsET/861,11284,0,8,0,0,1,MessageMediaWebPage,1
4,msg_iteration.4.user.DoctorsET.post.860,1102021238,DoctorsET,860,በቅርብ ጊዜ በሃገራችን ላይ እየተስተዋለ ያለ የተመሳሳይ ፆታ ( Homos...,በቅርብ ጊዜ በሃገራችን ላይ እየተስተዋለ ያለ የተመሳሳይ ፆታ ( Homos...,2023-09-01 16:16:15+00:00,https://t.me/DoctorsET/860,13091,0,11,0,0,1,MessageMediaWebPage,1


In [7]:
# Re-run the missing-value report on the reduced frame to see what gaps are left
check_missing_data(df_cleaned)

Unnamed: 0,Column Name,Missing Values,Percentage Missing
4,message,1171,15.617498
5,cleaned_message,1171,15.617498
14,media_type,571,7.615364
