### Reading Data

In [None]:
import pandas as pd

# Load the raw comments CSV into a DataFrame and preview its first rows.
# NOTE(review): this path lives directly under /content/, whereas the cleaned
# CSV later in this notebook is read from /content/drive/MyDrive/ — confirm
# that both locations are intended.
df = pd.read_csv('/content/Digikala-comments/Data/Data.csv')
df.head()

### Cleaning Data

In [None]:
# Null values: discard rows in which BOTH the title and the comment are
# missing, then renumber the remaining rows from 0.
df = df.dropna(subset=['title', 'comment'], how='all').reset_index(drop=True)

In [None]:
# Data Types
def set_types(df):
    """Cast ``title`` and ``comment`` to ``str`` and ``rate`` to ``int``.

    The converted columns are assigned back onto ``df`` itself, so the frame
    that was passed in is modified and that same object is returned.
    """
    for column, target_type in (('title', str), ('comment', str), ('rate', 'int')):
        df[column] = df[column].astype(target_type)
    return df
# Normalise the column dtypes of the working frame (title/comment -> str, rate -> int).
df = set_types(df)

In [None]:
# Managing NaNs
def replace_nan(entry):
    """Return '#' when ``entry`` is the literal string 'nan'; otherwise return it unchanged."""
    return '#' if entry == 'nan' else entry
# Swap the 'nan' placeholder strings in both text columns for '#'.
df['title'] = df['title'].apply(replace_nan)
df['comment'] = df['comment'].apply(replace_nan)

In [None]:
# Making our sentiment column:
#   Phrase    = "<title> <comment>"
#   Sentiment = rate
# The source columns are dropped once the two new columns exist.
df['Phrase'] = df['title'] + ' ' + df['comment']
df['Sentiment'] = df['rate']
df = df.drop(columns=['title', 'verification_status', 'comment', 'rate'])
df.head(1)

In [None]:
# Duplicates: keep the first occurrence of every distinct Phrase.
# The index is renumbered afterwards (as is done after the null-row filter),
# because dropping rows leaves gaps in the labels and a later label-based
# lookup such as df.loc[502] would raise KeyError if that row had been removed.
df = df.drop_duplicates(subset=['Phrase']).reset_index(drop=True)

#### More Cleaning

In [None]:
import numpy as np
from hazm import word_tokenize, stopwords_list, InformalLemmatizer
import re
# Module-level lemmatizer instance; clean_up() calls lemma.lemmatize() on each
# token when it is invoked with lemmatize=True.
lemma = InformalLemmatizer()

# Reference : https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
def remove_emoji(string):
    """Delete every run of characters that falls inside the listed code-point ranges.

    Note that the last range spans U+24C2..U+1F251, so it also matches
    non-emoji characters located in that span (for example CJK ideographs).
    """
    code_point_ranges = (
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # flags (iOS)
        u"\U00002702-\U000027B0",
        u"\U000024C2-\U0001F251",
    )
    matcher = re.compile("[" + "".join(code_point_ranges) + "]+", flags=re.UNICODE)
    return matcher.sub(r'', string)

# https://github.com/NeelShah18/emot/blob/master/emot/emo_unicode.py  emoticons list
# https://github.com/rishabhverma17/sms_slang_translator/blob/master/slang.txt Chat shortcuts

def remove_urls(text):
    """Replace each ``http://``/``https://`` or ``www.`` URL in ``text`` with one space."""
    return re.sub(r'https?://\S+|www\.\S+', r' ', text)

# The hyphen is escaped so that it is matched literally. Written as "+-=" inside
# a character class it forms the range U+002B..U+003D, which silently also
# matched the ASCII digits 0-9. The '.', ':' and ';' characters that the range
# used to cover are listed explicitly so they are still removed.
_PUNCTUATION_RE = re.compile(r'[~`!@#$%^&*(,<،>){}\\/|\'"?؟_+\-=.:;\[\]]')


def remove_punctuations(text):
    """Replace every punctuation character in ``text`` with a single space.

    Handles the ASCII punctuation listed in ``_PUNCTUATION_RE`` plus the
    Persian/Arabic comma (،) and question mark (؟). Digits are left untouched;
    each matched character is replaced individually, so consecutive
    punctuation yields consecutive spaces.

    Args:
        text: input string.

    Returns:
        The string with each punctuation character replaced by ``' '``.
    """
    return _PUNCTUATION_RE.sub(r' ', text)

def remove_html(text):
    """Replace every ``<...>`` span (shortest possible match) with one space."""
    return re.sub('<.*?>', r' ', text)

def remove_weird_chars(text):
    """Strip emoji/symbol/invisible characters, then blank out line breaks and '&amp;'.

    Two passes are made over ``text``:
      1. every run of characters inside the ranges below is deleted;
      2. each carriage return, newline and literal '&amp;' becomes one space.

    NOTE(review): U+200C (zero-width non-joiner) is deleted rather than
    replaced by a space, so the text on either side of it ends up joined —
    confirm this is the intended handling.
    """
    unwanted_ranges = (
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # flags (iOS)
        u"\U00002702-\U000027B0",
        u"\U000024C2-\U0001F251",
        u"\U0001f926-\U0001f937",
        u'\U00010000-\U0010ffff',  # everything outside the Basic Multilingual Plane
        u"\u200d",                 # zero-width joiner
        u"\u2640-\u2642",
        u"\u2600-\u2B55",
        u"\u23cf",
        u"\u23e9",
        u"\u231a",
        u"\u3030",
        u"\ufe0f",                 # variation selector-16
        u"\u2069",
        u"\u2066",
        u"\u200c",                 # zero-width non-joiner
        u"\u2068",
        u"\u2067",
    )
    symbol_re = re.compile("[" + "".join(unwanted_ranges) + "]+", flags=re.UNICODE)
    cleaned = symbol_re.sub(r'', text)
    # Second pass: turn CR, LF and the HTML-escaped ampersand into spaces.
    for token in ('\r', '\n', '&amp;'):
        cleaned = re.sub(token, r' ', cleaned)
    return cleaned

def remove_extra_repeated_alpha(text):
    """
    Collapse any letter that occurs three or more times in a row to a single one.

    Digits, underscores and non-word characters are never collapsed, and a
    letter that is merely doubled is left as it is.

    References:
    demo : https://regex101.com/r/ALxocA/1
    Question: https://bit.ly/2DoiPqS
    """
    repeated_letter = re.compile(r'([^\W\d_])\1{2,}')
    return repeated_letter.sub(r'\1', text)


def clean_up(text, url=True, html=True, weird_patterns=True, lemmatize=False,
             stopwords=True, isalpha=False, punctuations=True, remove_extra_alpha=True):
    """Run the text-cleaning pipeline on one string and return the cleaned string.

    The steps run in a fixed order; each one can be switched off with its flag.

    Args:
        text: the raw string to clean.
        url: replace URLs via ``remove_urls``.
        html: replace HTML tags via ``remove_html``.
        weird_patterns: strip emoji/symbols/invisible characters via
            ``remove_weird_chars``.
        lemmatize: lemmatize every token with the module-level ``lemma``.
        stopwords: drop tokens that appear in ``stopwords_list()``.
        isalpha: keep only tokens for which ``str.isalpha()`` is true.
        punctuations: replace punctuation via ``remove_punctuations``.
        remove_extra_alpha: collapse letters repeated 3+ times via
            ``remove_extra_repeated_alpha``.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # remove url
    if url:
        text = remove_urls(text)
    # remove html tags
    if html:
        text = remove_html(text)
    # remove emojis / symbols & pictographs / transport & map symbols / flags (iOS)
    if weird_patterns:
        text = remove_weird_chars(text)
    # remove punctuations
    if punctuations:
        text = remove_punctuations(text)
    # Alter words with repeated alphabets.
    # The flag parameter is tested here; testing the helper function itself
    # (as the code previously did) is always truthy and ignored the caller's choice.
    if remove_extra_alpha:
        text = remove_extra_repeated_alpha(text)
    # tokenize text
    tokens = word_tokenize(text)
    # remove stop words
    if stopwords:
        # Build the stop-word set once per call instead of once per token.
        stop_words = set(stopwords_list())
        tokens = [word for word in tokens if word not in stop_words]
    # remove non-alphabetic items
    if isalpha:
        tokens = [word for word in tokens if word.isalpha()]
    # lemmatize words
    if lemmatize:
        tokens = [lemma.lemmatize(word) for word in tokens]
    return ' '.join(tokens)



In [None]:
# Raw (not yet cleaned) example: the Phrase of the row labelled 502
test = df.loc[502, 'Phrase']
test

In [None]:
# The same example after cleaning, using clean_up's default options.
clean_up(test)

In [None]:
# Applying our function to Phrases
# may take some time
# df['Phrase'] = df['Phrase'].apply(clean_up)

In [None]:
# Saving the cleaned data
# df.to_csv('/content/drive/MyDrive/Digikala-comments/Data/Cleaned-data.csv')

In [None]:
# Reload the cleaned dataset from disk (the same path the commented-out save
# cell writes to) and preview it.
# NOTE(review): the original heading here was "Distribution of sentiment
# classes across our data", but this cell only loads the CSV — presumably the
# distribution itself is examined in a later cell.
df = pd.read_csv('/content/drive/MyDrive/Digikala-comments/Data/Cleaned-data.csv')
df.head()