In [1]:
import numpy as np
import pandas as pd
import category_encoders as ce
import sys, os
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from mpl_toolkits.mplot3d import Axes3D

In [2]:
# Make the project's helper modules in ../scripts importable from this notebook.
# The relative path is resolved against the kernel's current working directory.
# The membership guard keeps the cell idempotent: re-running it no longer
# appends the same directory to sys.path again.
SCRIPTS_DIR = os.path.abspath('../scripts')
if SCRIPTS_DIR not in sys.path:
    sys.path.append(SCRIPTS_DIR)
from file_handler import FileHandler
from df_selector import *
from df_cleaner import *
from df_visualizer import *
from app_logger import App_Logger

In [3]:
# Notebook-wide configuration: silence every warning category and let pandas
# render up to 50 columns when a wide frame is displayed.
warnings.simplefilter('ignore')
pd.options.display.max_columns = 50

# Reading Data

In [4]:
# create a FileHandler object (project helper imported from the file_handler module);
# it is used below to read the raw CSV and, at the end, to write the cleaned one
file_handler = FileHandler()

In [5]:
# read the raw message export of the `yetenaweg` channel (one row per post;
# the t.me links in msg_link suggest Telegram posts) and preview the first 10 rows
df = file_handler.read_csv("../data/yetenaweg.csv")
df.head(10)

Unnamed: 0,signature,channel_id,channel_name,msg_id,message,cleaned_message,date,msg_link,msg_from_peer,msg_from_id,views,number_replies,number_forwards,is_forward,forward_msg_from_peer_type,forward_msg_from_peer_id,forward_msg_from_peer_name,forward_msg_date,forward_msg_date_string,forward_msg_link,is_reply,reply_to_msg_id,reply_msg_link,contains_media,media_type,has_url,url,domain,url_title,url_description,document_type,document_id,document_video_duration,document_filename,poll_id,poll_question,poll_total_voters,poll_results,contact_phone_number,contact_name,contact_userid,geo_type,lat,lng,venue_id,venue_type,venue_title,venue_address,venue_provider
0,msg_iteration.0.user.yetenaweg.post.1087,1447066276,yetenaweg,1087,,,2024-06-14 16:33:19+00:00,https://t.me/yetenaweg/1087,,,298,0,0,0,,,,,,,0,,,1,MessageMediaPhoto,0,,,,,,,,,,,,,,,,,,,,,,,
1,msg_iteration.1.user.yetenaweg.post.1086,1447066276,yetenaweg,1086,,,2024-06-14 16:33:19+00:00,https://t.me/yetenaweg/1086,,,298,0,0,0,,,,,,,0,,,1,MessageMediaPhoto,0,,,,,,,,,,,,,,,,,,,,,,,
2,msg_iteration.2.user.yetenaweg.post.1085,1447066276,yetenaweg,1085,,,2024-06-14 16:33:18+00:00,https://t.me/yetenaweg/1085,,,295,0,0,0,,,,,,,0,,,1,MessageMediaPhoto,0,,,,,,,,,,,,,,,,,,,,,,,
3,msg_iteration.3.user.yetenaweg.post.1084,1447066276,yetenaweg,1084,·ã∞·àù ·àà·åç·à∞·ãç ·àÖ·ã≠·ãà·âµ ·àµ·àà·â≥·ã∞·åâ¬† ·ä•·äì·àò·à∞·åç·äì·àà·äï·ç¢\n\n‚úç‚úç ·â†·ã®·ä†·àò·â± ·â†·çà·à®·äï...,·ã∞·àù ·àà·åç·à∞·ãç ·àÖ·ã≠·ãà·âµ ·àµ·àà·â≥·ã∞·åâ ·ä•·äì·àò·à∞·åç·äì·àà·äï·ç¢ ‚úç‚úç ·â†·ã®·ä†·àò·â± ·â†·çà·à®·äï·åÜ·âΩ ·à∞...,2024-06-14 16:33:18+00:00,https://t.me/yetenaweg/1084,,,296,0,0,0,,,,,,,0,,,1,MessageMediaPhoto,0,,,,,,,,,,,,,,,,,,,,,,,
4,msg_iteration.5.user.yetenaweg.post.1082,1447066276,yetenaweg,1082,·å•·ã´·âÑ·ã´·âΩ·àÅ·äï ·ä†·åã·à©·äï,·å•·ã´·âÑ·ã´·âΩ·àÅ·äï ·ä†·åã·à©·äï,2024-06-09 15:28:37+00:00,https://t.me/yetenaweg/1082,,,1164,0,0,0,,,,,,,0,,,1,MessageMediaPhoto,0,,,,,,,,,,,,,,,,,,,,,,,
5,msg_iteration.6.user.yetenaweg.post.1081,1447066276,yetenaweg,1081,https://t.me/yetenaweg?livestream\n\n·àµ·àà ·ä©·àã·àä·âµ ·å§...,https://t.me/yetenaweg?livestream ·àµ·àà ·ä©·àã·àä·âµ ·å§·äì ·ç£...,2024-06-09 15:05:18+00:00,https://t.me/yetenaweg/1081,,,1087,0,1,0,,,,,,,0,,,1,MessageMediaWebPage,1,https://t.me/yetenaweg?livestream,t.me,·ã®·å§·äì ·ãà·åç - ·ã®·å§·äì ·àò·à®·åÉ,·ã≠·àÖ ·ã®·å§·äì ·ãà·åç ·äê·ãç·ç¢\nYetenaweg.com \n·àµ·àà ·å§·äï·äê·â≥·âΩ·àÅ ·àõ·ãà·âÖ ·ã®...,,,,,,,,,,,,,,,,,,,
6,msg_iteration.8.user.yetenaweg.post.1079,1447066276,yetenaweg,1079,,,2024-06-06 07:51:31+00:00,https://t.me/yetenaweg/1079,,,1519,0,2,0,,,,,,,0,,,1,MessageMediaPhoto,0,,,,,,,,,,,,,,,,,,,,,,,
7,msg_iteration.9.user.yetenaweg.post.1078,1447066276,yetenaweg,1078,üì£·â†·ãö·àÖ ·ä•·àÅ·ãµ ·â†·â¥·àå·åç·à´·àù ·âÄ·å•·â≥ ·àµ·à≠·å≠·âµ ·ã®·ä©·àã·àä·âµ ·â†·àΩ·â≥ ·ä≠·âµ·âµ·àç ·ä•·äì ·ã®·ã≥·ã´...,üì£·â†·ãö·àÖ ·ä•·àÅ·ãµ ·â†·â¥·àå·åç·à´·àù ·âÄ·å•·â≥ ·àµ·à≠·å≠·âµ ·ã®·ä©·àã·àä·âµ ·â†·àΩ·â≥ ·ä≠·âµ·âµ·àç ·ä•·äì ·ã®·ã≥·ã´...,2024-06-06 07:51:30+00:00,https://t.me/yetenaweg/1078,,,1499,0,2,0,,,,,,,0,,,1,MessageMediaPhoto,0,,,,,,,,,,,,,,,,,,,,,,,
8,msg_iteration.10.user.yetenaweg.post.1077,1447066276,yetenaweg,1077,https://www.clubhouse.com/room/xlOVk34E?utm_me...,https://www.clubhouse.com/room/xlOVk34E?utm_me...,2024-06-05 20:27:37+00:00,https://t.me/yetenaweg/1077,,,1310,0,4,0,,,,,,,0,,,1,MessageMediaPhoto,0,,,,,,,,,,,,,,,,,,,,,,,
9,msg_iteration.12.user.yetenaweg.post.1075,1447066276,yetenaweg,1075,,,2024-05-31 17:24:31+00:00,https://t.me/yetenaweg/1075,,,2078,0,2,0,,,,,,,0,,,1,MessageMediaPhoto,0,,,,,,,,,,,,,,,,,,,,,,,


# General Statistics

In [6]:
# total number of cells in the frame (rows x columns)
df.size

45766

In [7]:
# (number of rows, number of columns) of the raw frame
df.shape

(934, 49)

In [8]:
# per-column non-null counts and dtypes; columns with 0 non-null values are entirely empty
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 934 entries, 0 to 933
Data columns (total 49 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   signature                   934 non-null    object 
 1   channel_id                  934 non-null    int64  
 2   channel_name                934 non-null    object 
 3   msg_id                      934 non-null    int64  
 4   message                     694 non-null    object 
 5   cleaned_message             694 non-null    object 
 6   date                        934 non-null    object 
 7   msg_link                    934 non-null    object 
 8   msg_from_peer               0 non-null      float64
 9   msg_from_id                 0 non-null      float64
 10  views                       934 non-null    int64  
 11  number_replies              934 non-null    int64  
 12  number_forwards             934 non-null    int64  
 13  is_forward                  934 non

# Missing Values

In [10]:
# project helper (brought in by one of the star imports): prints the overall
# percentage of missing cells in the frame
percent_missing_values(df)

The dataset contains 64.96 % missing values.


In [11]:
# project helper: prints how many columns have missing values and returns a
# per-column summary table (displayed further down via `missing_df`)
missing_df = missing_values_table(df)

Your selected dataframe has 49 columns.
There are 36 columns that have missing values.


In [12]:
# discard every column that contains no data at all (all values NaN)
df = df.dropna(axis='columns', how='all')

In [13]:
# shape after removing the all-NaN columns
df.shape

(934, 36)

In [14]:
# NOTE: this table was computed before the all-NaN columns were dropped from `df`,
# so it still lists the 100 %-missing columns that `df` no longer contains
missing_df

Unnamed: 0,Missing Values,% of Total Values,Dtype
venue_provider,934,100.0,float64
contact_userid,934,100.0,float64
msg_from_peer,934,100.0,float64
msg_from_id,934,100.0,float64
venue_address,934,100.0,float64
venue_title,934,100.0,float64
venue_type,934,100.0,float64
venue_id,934,100.0,float64
contact_phone_number,934,100.0,float64
contact_name,934,100.0,float64


In [15]:
# project helper: prints how many rows contain at least one missing value
count_missing_rows(df)

934 rows(100.0%) contain atleast one missing value.


In [16]:
# rows in which `forward_msg_link` is missing; the notebook refers to the
# columns that are empty together with it as "group 1"
group1_df = df.loc[df['forward_msg_link'].isnull()]
group1_df.head(10)

Unnamed: 0,signature,channel_id,channel_name,msg_id,message,cleaned_message,date,msg_link,views,number_replies,number_forwards,is_forward,forward_msg_from_peer_type,forward_msg_from_peer_id,forward_msg_from_peer_name,forward_msg_date,forward_msg_date_string,forward_msg_link,is_reply,reply_to_msg_id,reply_msg_link,contains_media,media_type,has_url,url,domain,url_title,url_description,document_type,document_id,document_video_duration,document_filename,poll_id,poll_question,poll_total_voters,poll_results
0,msg_iteration.0.user.yetenaweg.post.1087,1447066276,yetenaweg,1087,,,2024-06-14 16:33:19+00:00,https://t.me/yetenaweg/1087,298,0,0,0,,,,,,,0,,,1,MessageMediaPhoto,0,,,,,,,,,,,,
1,msg_iteration.1.user.yetenaweg.post.1086,1447066276,yetenaweg,1086,,,2024-06-14 16:33:19+00:00,https://t.me/yetenaweg/1086,298,0,0,0,,,,,,,0,,,1,MessageMediaPhoto,0,,,,,,,,,,,,
2,msg_iteration.2.user.yetenaweg.post.1085,1447066276,yetenaweg,1085,,,2024-06-14 16:33:18+00:00,https://t.me/yetenaweg/1085,295,0,0,0,,,,,,,0,,,1,MessageMediaPhoto,0,,,,,,,,,,,,
3,msg_iteration.3.user.yetenaweg.post.1084,1447066276,yetenaweg,1084,·ã∞·àù ·àà·åç·à∞·ãç ·àÖ·ã≠·ãà·âµ ·àµ·àà·â≥·ã∞·åâ¬† ·ä•·äì·àò·à∞·åç·äì·àà·äï·ç¢\n\n‚úç‚úç ·â†·ã®·ä†·àò·â± ·â†·çà·à®·äï...,·ã∞·àù ·àà·åç·à∞·ãç ·àÖ·ã≠·ãà·âµ ·àµ·àà·â≥·ã∞·åâ ·ä•·äì·àò·à∞·åç·äì·àà·äï·ç¢ ‚úç‚úç ·â†·ã®·ä†·àò·â± ·â†·çà·à®·äï·åÜ·âΩ ·à∞...,2024-06-14 16:33:18+00:00,https://t.me/yetenaweg/1084,296,0,0,0,,,,,,,0,,,1,MessageMediaPhoto,0,,,,,,,,,,,,
4,msg_iteration.5.user.yetenaweg.post.1082,1447066276,yetenaweg,1082,·å•·ã´·âÑ·ã´·âΩ·àÅ·äï ·ä†·åã·à©·äï,·å•·ã´·âÑ·ã´·âΩ·àÅ·äï ·ä†·åã·à©·äï,2024-06-09 15:28:37+00:00,https://t.me/yetenaweg/1082,1164,0,0,0,,,,,,,0,,,1,MessageMediaPhoto,0,,,,,,,,,,,,
5,msg_iteration.6.user.yetenaweg.post.1081,1447066276,yetenaweg,1081,https://t.me/yetenaweg?livestream\n\n·àµ·àà ·ä©·àã·àä·âµ ·å§...,https://t.me/yetenaweg?livestream ·àµ·àà ·ä©·àã·àä·âµ ·å§·äì ·ç£...,2024-06-09 15:05:18+00:00,https://t.me/yetenaweg/1081,1087,0,1,0,,,,,,,0,,,1,MessageMediaWebPage,1,https://t.me/yetenaweg?livestream,t.me,·ã®·å§·äì ·ãà·åç - ·ã®·å§·äì ·àò·à®·åÉ,·ã≠·àÖ ·ã®·å§·äì ·ãà·åç ·äê·ãç·ç¢\nYetenaweg.com \n·àµ·àà ·å§·äï·äê·â≥·âΩ·àÅ ·àõ·ãà·âÖ ·ã®...,,,,,,,,
6,msg_iteration.8.user.yetenaweg.post.1079,1447066276,yetenaweg,1079,,,2024-06-06 07:51:31+00:00,https://t.me/yetenaweg/1079,1519,0,2,0,,,,,,,0,,,1,MessageMediaPhoto,0,,,,,,,,,,,,
7,msg_iteration.9.user.yetenaweg.post.1078,1447066276,yetenaweg,1078,üì£·â†·ãö·àÖ ·ä•·àÅ·ãµ ·â†·â¥·àå·åç·à´·àù ·âÄ·å•·â≥ ·àµ·à≠·å≠·âµ ·ã®·ä©·àã·àä·âµ ·â†·àΩ·â≥ ·ä≠·âµ·âµ·àç ·ä•·äì ·ã®·ã≥·ã´...,üì£·â†·ãö·àÖ ·ä•·àÅ·ãµ ·â†·â¥·àå·åç·à´·àù ·âÄ·å•·â≥ ·àµ·à≠·å≠·âµ ·ã®·ä©·àã·àä·âµ ·â†·àΩ·â≥ ·ä≠·âµ·âµ·àç ·ä•·äì ·ã®·ã≥·ã´...,2024-06-06 07:51:30+00:00,https://t.me/yetenaweg/1078,1499,0,2,0,,,,,,,0,,,1,MessageMediaPhoto,0,,,,,,,,,,,,
8,msg_iteration.10.user.yetenaweg.post.1077,1447066276,yetenaweg,1077,https://www.clubhouse.com/room/xlOVk34E?utm_me...,https://www.clubhouse.com/room/xlOVk34E?utm_me...,2024-06-05 20:27:37+00:00,https://t.me/yetenaweg/1077,1310,0,4,0,,,,,,,0,,,1,MessageMediaPhoto,0,,,,,,,,,,,,
9,msg_iteration.12.user.yetenaweg.post.1075,1447066276,yetenaweg,1075,,,2024-05-31 17:24:31+00:00,https://t.me/yetenaweg/1075,2078,0,2,0,,,,,,,0,,,1,MessageMediaPhoto,0,,,,,,,,,,,,


In [17]:
# all unique values for each column of the rows without a forward link
# (project helper; returns a Column / Unique values table)
unique_values_df(group1_df)

Unnamed: 0,Column,Unique values
0,signature,"[msg_iteration.0.user.yetenaweg.post.1087, msg..."
1,channel_id,[1447066276]
2,channel_name,[yetenaweg]
3,msg_id,"[1087, 335, 349, 346, 345, 344, 343, 342, 341,..."
4,message,[#·ã®·å§·äì·ãà·åç·äï·ã≠·å†·ã≠·âÅ\n\n·â†·ã®·ä´·â≤·âµ 30/2015 ·ä®·àö·ä®·â†·à®·ãç ·ã®·ä†·àà·àù ·ã®·ä©·àã·àä...
5,cleaned_message,[#·ã®·å§·äì·ãà·åç·äï·ã≠·å†·ã≠·âÅ ·â†·ã®·ä´·â≤·âµ 30/2015 ·ä®·àö·ä®·â†·à®·ãç ·ã®·ä†·àà·àù ·ã®·ä©·àã·àä·âµ ·âÄ...
6,date,"[2020-02-20 01:39:02+00:00, 2020-03-18 10:57:4..."
7,msg_link,"[https://t.me/yetenaweg/1087, https://t.me/yet..."
8,views,"[1442, 1726, 397, 1279, 1222, 1155, 1330, 399,..."
9,number_replies,"[0, 1, 2, 4, 3, 5, 6, 11, 8, 21, 15, 13, 7]"


In [18]:
# Replace the gaps in the forward-related columns with 0 (project helper; it
# reports the number of replaced values per column).
# NOTE(review): filling text/date columns with the integer 0 appears to be what
# makes them show up as "mixed-integer" in the Data Types section - confirm
# whether a string sentinel would be more appropriate.
fix_missing_value(
    df,
    [
        'forward_msg_from_peer_type',
        'forward_msg_from_peer_id',
        'forward_msg_from_peer_name',
        'forward_msg_date',
        'forward_msg_date_string',
        'forward_msg_link',
    ],
    0,
)

863 missing values in the column forward_msg_from_peer_type have been replaced by 0.
863 missing values in the column forward_msg_from_peer_id have been replaced by 0.
884 missing values in the column forward_msg_from_peer_name have been replaced by 0.
805 missing values in the column forward_msg_date have been replaced by 0.
805 missing values in the column forward_msg_date_string have been replaced by 0.
884 missing values in the column forward_msg_link have been replaced by 0.


In [19]:
# Replace the gaps in the URL, document, media and poll columns with 0
# (project helper; it reports the number of replaced values per column).
fix_missing_value(
    df,
    [
        # link preview
        'url',
        'domain',
        'url_title',
        'url_description',
        # attached document
        'document_type',
        'document_id',
        'document_video_duration',
        'document_filename',
        # media kind
        'media_type',
        # poll
        'poll_id',
        'poll_question',
        'poll_total_voters',
        'poll_results',
    ],
    0,
)

704 missing values in the column url have been replaced by 0.
704 missing values in the column domain have been replaced by 0.
709 missing values in the column url_title have been replaced by 0.
708 missing values in the column url_description have been replaced by 0.
877 missing values in the column document_type have been replaced by 0.
877 missing values in the column document_id have been replaced by 0.
909 missing values in the column document_video_duration have been replaced by 0.
897 missing values in the column document_filename have been replaced by 0.
99 missing values in the column media_type have been replaced by 0.
928 missing values in the column poll_id have been replaced by 0.
928 missing values in the column poll_question have been replaced by 0.
928 missing values in the column poll_total_voters have been replaced by 0.
928 missing values in the column poll_results have been replaced by 0.


In [27]:
# Replace the remaining gaps with 0 so that the frame ends up without missing values.
# `reply_to_msg_id` and `reply_msg_link` are included because they are empty in
# the previewed rows and no other cell in this notebook fills them; without
# them a Restart & Run All would not reach the 0.0 % missing values reported
# by the next cell (they were evidently filled by a cell that no longer exists).
fix_missing_value(
    df,
    [
        'msg_link',
        'message',
        'msg_id',
        'channel_name',
        'cleaned_message',
        'reply_to_msg_id',
        'reply_msg_link',
    ],
    0,
)

0 missing values in the column msg_link have been replaced by 0.
240 missing values in the column message have been replaced by 0.
0 missing values in the column msg_id have been replaced by 0.
0 missing values in the column channel_name have been replaced by 0.
0 missing values in the column cleaned_message have been replaced by 0.


In [36]:
# sanity check after the fills: the overall share of missing cells should now be 0
percent_missing_values(df)

The dataset contains 0.0 % missing values.


# Data Types

In [37]:
# project helper: lists the columns whose values are of more than one Python type
# (here: text columns that now also hold the integer 0 used as fill value)
show_cols_mixed_dtypes(df)

                        Column      Data type
0                      message  mixed-integer
1              cleaned_message  mixed-integer
2   forward_msg_from_peer_type  mixed-integer
3   forward_msg_from_peer_name  mixed-integer
4             forward_msg_date  mixed-integer
5      forward_msg_date_string  mixed-integer
6             forward_msg_link  mixed-integer
7               reply_msg_link  mixed-integer
8                   media_type  mixed-integer
9                          url  mixed-integer
10                      domain  mixed-integer
11                   url_title  mixed-integer
12             url_description  mixed-integer
13               document_type  mixed-integer
14           document_filename  mixed-integer
15               poll_question  mixed-integer
16                poll_results  mixed-integer


In [38]:
# column dtypes before the string conversion
df.dtypes

signature                                  object
channel_id                                  int64
channel_name                               object
msg_id                                      int64
message                                    object
cleaned_message                            object
date                          datetime64[ns, UTC]
msg_link                                   object
views                                       int64
number_replies                              int64
number_forwards                             int64
is_forward                                  int64
forward_msg_from_peer_type                 object
forward_msg_from_peer_id                  float64
forward_msg_from_peer_name                 object
forward_msg_date                           object
forward_msg_date_string                    object
forward_msg_link                           object
is_reply                                    int64
reply_to_msg_id                           float64


In [39]:
# names of all columns that are still stored with the generic `object` dtype
string_columns = list(df.select_dtypes(include=['object']).columns)
string_columns

['signature',
 'channel_name',
 'message',
 'cleaned_message',
 'msg_link',
 'forward_msg_from_peer_type',
 'forward_msg_from_peer_name',
 'forward_msg_date',
 'forward_msg_date_string',
 'forward_msg_link',
 'reply_msg_link',
 'media_type',
 'url',
 'domain',
 'url_title',
 'url_description',
 'document_type',
 'document_filename',
 'poll_question',
 'poll_results']

In [40]:
# project helper: cast the object columns collected above to pandas' string dtype
# (the dtypes listing below shows them as string[python] afterwards)
convert_to_string(df, string_columns)

In [34]:
# project helper: parse the `date` column into a timezone-aware datetime column.
# NOTE(review): the recorded execution count (34) shows this cell was run before
# the dtype cells above it. On a top-to-bottom run `date` is still an object
# column when `string_columns` is built, so it would first be converted to
# string - consider moving this cell in front of the `string_columns` cell.
convert_to_datetime(df, ['date'])

In [41]:
# re-check after the conversions: no column should hold mixed types any more
show_cols_mixed_dtypes(df)

None of the columns contain mixed types.


In [42]:
# column dtypes after the string / datetime conversions
df.dtypes

signature                          string[python]
channel_id                                  int64
channel_name                       string[python]
msg_id                                      int64
message                            string[python]
cleaned_message                    string[python]
date                          datetime64[ns, UTC]
msg_link                           string[python]
views                                       int64
number_replies                              int64
number_forwards                             int64
is_forward                                  int64
forward_msg_from_peer_type         string[python]
forward_msg_from_peer_id                  float64
forward_msg_from_peer_name         string[python]
forward_msg_date                   string[python]
forward_msg_date_string            string[python]
forward_msg_link                   string[python]
is_reply                                    int64
reply_to_msg_id                           float64


# Duplicates

In [44]:
# search for duplicate rows and drop them (project helper; prints what it found).
# NOTE(review): the return value is not assigned, so this presumably modifies
# `df` in place - confirm against the helper's implementation.
drop_duplicates(df)

No duplicate rows were found.


In [45]:
# True if any `signature` value occurs more than once.
# `.any()` is required here: duplicated() never flags the first occurrence of a
# value, so `.all()` is False for every non-empty frame and cannot detect
# duplicates at all.
df.duplicated(subset=['signature']).any()

False

In [46]:
# persist the cleaned frame next to the raw data via the project FileHandler
# (presumably requires ../data/new to exist already - verify)
file_handler.to_csv(df, '../data/new/yetenaweg.csv')