In [None]:
%%capture
# !pip install dataprep
# !pip install wordcloud-fa

# !pip install transformers
# !pip install datasets
# !pip install hazm
# !pip install clean-text[gpl]
# !pip install -U kaleido

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px

from wordcloud_fa import WordCloudFa

import hazm
from cleantext import clean

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.utils import shuffle

import os
import re
import json
import ast
import copy
import collections
import re

In [2]:
digikala_path = 'data/'
digi_data = pd.read_csv(digikala_path + 'digikala_total.csv')

In [3]:
digi_data.head(3)

Unnamed: 0,product_id,product_title,title_en,user_id,likes,dislikes,verification_status,recommend,title,comment,advantages,disadvantages
0,3692,ماوس بی‌سیم لاجیتک مدل M325,IT,989472,0,0,verified,\N,,واقعا عالیه. من که ازش خیلی راضیم,,
1,90213,شارژر همراه شیاومی مدل NDY-02-AN با ظرفیت 1000...,AC,3862150,4,1,verified,recommended,واقعاً عالیه,سلام، قبل اینکه نظرم رو بگم میخواستم به یک موض...,"[""عمر طولانی\r"",""افت بسیار کم میزان شارژ\r"",""ا...","[""ندارد""]"
2,59473,یدک پولیشر میکروفایبر مهسان مدل 20119,HW,626843,1,0,verified,not_recommended,خیلی سخت حوله اش در میاد,گیره های فلزی خیلی سخت تا میشوند و لذا حوله را...,,


In [6]:
digi_data.recommend.value_counts()

recommended        36972
\N                 36382
not_recommended    16110
no_idea            10536
Name: recommend, dtype: int64

In [4]:
df = digi_data[digi_data.recommend != '\\N'].copy()

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 63618 entries, 1 to 99999
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   product_id           63618 non-null  int64 
 1   product_title        63618 non-null  object
 2   title_en             63618 non-null  object
 3   user_id              63618 non-null  int64 
 4   likes                63618 non-null  int64 
 5   dislikes             63618 non-null  int64 
 6   verification_status  63618 non-null  object
 7   recommend            63618 non-null  object
 8   title                61624 non-null  object
 9   comment              63586 non-null  object
 10  advantages           26000 non-null  object
 11  disadvantages        22168 non-null  object
dtypes: int64(4), object(8)
memory usage: 6.3+ MB


In [7]:
df = df[['title_en', 'likes', 'dislikes', 'verification_status', 'recommend', 'comment']].copy()
df = df.rename(columns={"recommend": "label"})

In [8]:
df.head()

Unnamed: 0,title_en,likes,dislikes,verification_status,label,comment
1,AC,4,1,verified,recommended,سلام، قبل اینکه نظرم رو بگم میخواستم به یک موض...
2,HW,1,0,verified,not_recommended,گیره های فلزی خیلی سخت تا میشوند و لذا حوله را...
3,MO,6,11,verified,no_idea,همه چیز در رابطه با ظاهر این گوشی بسیار خوب اس...
4,AC,19,4,verified,no_idea,اگر ظرفیتش براتون کافیه حتما بخرید._x000D_\nیه...
5,IT,6,1,verified,recommended,سلام دوستان،،_x000D_\nمنم مثه بعضی از دوستان ق...


In [10]:
# dropna

df[df.comment.isna() == True]['label'].value_counts()

not_recommended    12
recommended        12
no_idea             8
Name: label, dtype: int64

In [12]:
# drop duplicate

df[df.duplicated(subset=['comment']) == True].head(3)

Unnamed: 0,title_en,likes,dislikes,verification_status,label,comment
993,PA,3,1,verified,recommended,عالی
1695,IT,5,2,verified,recommended,حتما بخرید
1956,MO,4,43,verified,not_recommended,پیشنهاد نمیکنم


In [19]:
df[df.comment == 'عالی'].shape, \
df[df.comment == 'عالیه'].shape, \
df[df.comment == 'پیشنهاد نمیکنم'].shape

((155, 6), (53, 6), (17, 6))

In [37]:
class DataProcessor():

    def __init__(self, dataframe, 
                 comment_column='comment',
                 label_column='label', 
                 config=None):
#         for cnfg in config:
#             try:
#                 function, args = cnfg
#                 print(function, args)
#                 getattr(self,function)(**args)
#             except:
#                 getattr(self,cnfg)()
#     #load data    
#     def read_data(self, path = 'data/hamrah/train.csv', comment_column = 'comment', label_column= 'label', args={}):
        self.dataframe = dataframe # pd.read_csv(path, **args)
        self.comment_column = comment_column
        self.label_column = label_column

    #delete NAN rows base on comment column
    def dropna(self):
        self.dataframe = self.dataframe.dropna(subset=[self.comment_column]).reset_index(drop=True)

    #remove duplicates
    def drop_duplicates(self):
        self.dataframe = self.dataframe.drop_duplicates(subset=[self.comment_column]).reset_index(drop=True)
        
    def regex_clean(self, path):
        with open(path, 'r') as f:
        #    regexs = json.load(f)
            regexs = f.read()
            regexs = ast.literal_eval(regexs)
        re_pattern = re.compile("(" + "|".join(regexs.keys()) + ")")
        #lambda mo: regexs[mo.string[mo.start():mo.end()]]
        
        self.dataframe['cleaned_' + self.comment_column] = self.dataframe[self.comment_column].apply(
            lambda text: re.sub(re_pattern, '', text)
        )

    def normalize(self, **args):
        normalizer = hazm.Normalizer()
        self.dataframe['cleaned_' + self.comment_column] = self.dataframe[self.comment_column].apply(
            lambda text: normalizer.normalize(text)
        )
    
    def process(self):
        pass
#     def split(self,train_size=0.64, test_size=0.2, validation_size=0.16, random_state=40, balance=False):
#         #self.labels = list(sorted(self.clean_data[self.la].unique()))
#         #self.clean_data.loc[:,['label' + '_id']] = self.clean_data['label'].apply(lambda t: self.labels.index(t))
#         if balance:
#             balance = self.dataframe[self.label_column]
#         else:
#             balance = None
        
#         train, test = train_test_split(self.dataframe, test_size=0.2, random_state=random_state,stratify=balance)
#         train, val = train_test_split(train, test_size=0.2, random_state=random_state,stratify=balance)

#         self.train = train.reset_index(drop=True)
#         self.val = val.reset_index(drop=True)
#         self.test = test.reset_index(drop=True)

#     def save_output(self, path, mode, args_pd_to_csv):
#         if not os.path.isdir(path):
#             os.makedirs(path)
        
#         for name in ['train', 'val', 'test']:
#             name_path = DataProcessor.file_path_handler(path, name, '.csv', mode)
#             getattr(self, name).to_csv(name_path, **args_pd_to_csv)
        
#     def generate_config_template(output_directory):
#         template={
#             ('read_data', {'path':'data/hamrah/train.csv', 'comment_column': 'comment', 'label_column': 'label', 'args':{}}),
#             'dropna',
#             'drop_duplicates',
#             ('comment_length_filter', {'min_len':3, 'max_len':256}),
#             'balance',
#             ('stopwords', {'path': 'data/stopwords.txt'}),
#             ('normalize', {}),
#             ('regex_clean', {'path': 'data/regex.json'}),
#             ('split', {'train_size':0.8, 'test_size':0.15, 'validation_size':0.05, 'random_state':False}),
#             ('save_output', {'path':'data/hamrah/preprocess', 'mode':'number', 'args_pd_to_csv':{'encoding':'utf-8'}}), #mode can be eather overwrite or number
#         }
        
#         path = DataProcessor.file_path_handler(output_directory, 'preprocess_config_template', '.json')
#         with open(path, 'w') as f:
#             json.dump(template,f, indent=1)

#     def file_path_handler(output_directory, name, extension, mode='number'):
#         path = os.path.join(output_directory, name)
#         if mode == 'overwrite':
#             return path
#         else:
#             i = 0
#             while os.path.isfile(path+extension):
#                 path = os.path.join(output_directory, name+str(i))
#                 i+=1
#             if mode:
#                 path+=extension
#             return path

In [38]:
processor = DataProcessor(df)

processor.dropna()
processor.drop_duplicates()
processor.normalize()
processor.regex_clean('utils/regex_clean.txt')

clean_df = processor.dataframe.copy()

In [39]:
clean_df.isna().sum()

title_en               0
likes                  0
dislikes               0
verification_status    0
label                  0
comment                0
cleaned_comment        0
dtype: int64

In [40]:
clean_df['comment_len_char'] = clean_df['cleaned_comment'].str.len()

In [41]:
tokenizer = hazm.WordTokenizer()

clean_df['comment_len_word'] = clean_df['cleaned_comment'].apply(lambda x: len(tokenizer.tokenize(x)))

In [43]:
clean_df.sample(3)

Unnamed: 0,title_en,likes,dislikes,verification_status,label,comment,cleaned_comment,comment_len_char,comment_len_word
13615,PC,3,0,verified,recommended,تازه به دستم رسید رنگشو دوست دارم خوشرنگه و خی...,ازه به دستم رسید رنگشو دوست دارم خوشرنگه و خیل...,62,13
42981,TS,0,0,verified,recommended,خیلی قشنگه تو شگفت انگیز گرفتم و راضی هستم از ...,یلی قشنگه تو شگفت انگیز گرفتم و راضی هستم از خ...,50,11
21201,HW,1,0,verified,recommended,بسیار زیبا و کاربردیه. از خریدم راضی ام,سیار زیبا و کاربردیه از خریدم راضی ام,37,8


# 2. Data Preparation Phase

## A) identifying outliers

In [58]:
fig = px.box(clean_df, x=[
    'likes', 
                          'comment_len_char', 
                          'comment_len_word'
                         ]
            )
fig.show()

In [60]:
df_eda = clean_df.copy()

In [61]:
# IQR
def subset_by_iqr(df, column, whisker_width=1.5):
    Q1 = df[column].quantile(0.25)  
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    
    # set None for outlier data
    df[column + '_iqr'] = df[column]
    df.loc[((df[column] < Q1 - whisker_width*IQR) | (df[column] > Q3 + whisker_width*IQR)), column + '_iqr'] = None
    return df[column + '_iqr']



columns = ['likes', 'dislikes', 'comment_len_char', 'comment_len_word']

for col in columns:
    df_eda[col + '_iqr'] = subset_by_iqr(df_eda, col, whisker_width=1.5)


In [62]:
# Z score
columns = ['likes', 'dislikes', 'comment_len_char', 'comment_len_word']

for col in columns:
    df_eda[col + '_zscore'] = (df_eda[col] - df_eda[col].mean()) / df_eda[col].std()
    df_eda.loc[np.abs(df_eda[col + '_zscore']) > 3, col + '_zscore'] = None


In [63]:
# compare z_score and IQR ???

## B) Transformation & Standardization

In [None]:
# Z score, transformations and Skewness

In [76]:
df_eda['likes_tmp'] = np.log(df_eda.likes)


divide by zero encountered in log



In [77]:
px.box(df_eda, y='likes_tmp')

## C) Reclassify

In [None]:
# onehot vector

In [None]:
clean_df.head()

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
clean_df['verification_status_en'] = le.fit_transform(clean_df['verification_status'])
verification = dict(zip(le.classes_, le.transform(le.classes_)))
verification

In [None]:
le = LabelEncoder()
clean_df['label_encode'] = le.fit_transform(clean_df['label'])
label_encode = dict(zip(le.classes_, le.transform(le.classes_)))
label_encode

## D) Binning

In [None]:
# cut, pcut, and kmeans

In [None]:
fig = px.histogram(clean_df, x='comment_len_char', 
                   )

fig.update_xaxes(range=[0, 1000])
fig.update_layout(
    title_text='Distribution of comment length',
    xaxis_title_text='Comment Length (char)',
    yaxis_title_text='Frequency',
#     bargap=0.2,
    bargroupgap=0.1
)

fig.show()

In [64]:
fig = px.histogram(clean_df, x='comment_len_char', color='label',
                   color_discrete_map={
                       'recommended': 'green', 
                       'not_recommended': 'red', 
                       'no_idea': 'goldenrod'
                   })

fig.update_xaxes(range=[0, 1000])
fig.update_layout(
    title_text='Distribution of comment length',
    xaxis_title_text='Comment Length (char)',
    yaxis_title_text='Frequency',
#     bargap=0.2,
    bargroupgap=0.1
)

fig.show()

In [73]:
fig = px.histogram(df_eda, x='comment_len_char_zscore', color='label',
                   color_discrete_map={
                       'recommended': 'green', 
                       'not_recommended': 'red', 
                       'no_idea': 'goldenrod'
                   }, barnorm='fraction' # 'precent'
                  )

# fig.update_xaxes(range=[0, 1000])
fig.update_layout(
    title_text='Distribution of comment length',
    xaxis_title_text='Comment Length (char)',
    yaxis_title_text='Frequency',
#     bargap=0.2,
    bargroupgap=0.1
)

fig.show()

# 3. Exploratory Data Analysis Phase

## A

In [None]:
df2 = clean_df[['likes', 'dislikes', 'verification_status', 'label_encode', 'comment_len_char', 'comment_len_word']].copy()


In [None]:
import seaborn as sns

plt.figure(figsize=(20, 10))
sns.heatmap(df2.corr(), cmap ='YlGnBu', annot = True, fmt='.3f', linewidths = 0.30)

In [None]:
fig = px.scatter_matrix(df2)
fig.show()

In [None]:
fig = px.scatter_matrix(df2,
                       color='label_encode',
                       color_discrete_map={
                       0: 'green', 
                       1: 'red', 
                       2: 'goldenrod'
                   })
fig.show()

In [None]:
label_encode

In [None]:
fig = px.scatter(df2, x="comment_len_char", y="likes", color="label_encode",
                labels={
                       0: '#000000', 
                       1: 'red', 
#                        2: 'goldenrod'
                   })
fig.show()

In [None]:

fig = go.Figure(data=go.Heatmap(
        z=df2.corr(),
        x=df2.columns,
        y=df2.columns,
        colorscale='RdYlGn',
    hoverongaps = False
))

fig.update_layout(
    title='GitHub commits per day',
)
fig.update_l


fig.show()

## C) Binning

## D) new features