In [18]:
# Import libraries
import pandas as pd
import string
import contractions
from bs4 import BeautifulSoup
import nltk 
import spacy
import string
import itertools
import re

nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\amr_a\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### 1) Load data and concat them

In [19]:
yelp = pd.read_table('../Data/yelp_labelled.txt', on_bad_lines='skip')

fir = yelp.columns  #Get the first row for the header
new_header = ['Sentiment', 'Class'] #Take the data less the header row
yelp.columns = new_header #Set the header row as the df header
yelp.iloc[-1] = fir

yelp.head()

Unnamed: 0,Sentiment,Class
0,Crust is not good.,0
1,Not tasty and the texture was just nasty.,0
2,Stopped by during the late May bank holiday of...,1
3,The selection on the menu was great and so wer...,1
4,Now I am getting angry and I want my damn pho.,0


In [20]:
amazon = pd.read_table('../Data/amazon_cells_labelled.txt', on_bad_lines='skip')

fir = amazon.columns  #Get the first row for the header
new_header = ['Sentiment', 'Class'] #Take the data less the header row
amazon.columns = new_header #Set the header row as the df header
amazon.iloc[-1] = fir

amazon.head()

Unnamed: 0,Sentiment,Class
0,"Good case, Excellent value.",1
1,Great for the jawbone.,1
2,Tied to charger for conversations lasting more...,0
3,The mic is great.,1
4,I have to jiggle the plug to get it to line up...,0


In [21]:
amazon.shape

(999, 2)

In [22]:
imdb = pd.read_table('../Data/imdb_labelled.txt', on_bad_lines='skip')

fir = imdb.columns  #Get the first row for the header
new_header = ['Sentiment', 'Class'] #Take the data less the header row
imdb.columns = new_header #Set the header row as the df header
imdb.iloc[-1] = fir

imdb.head()

Unnamed: 0,Sentiment,Class
0,Not sure who was more lost - the flat characte...,0
1,Attempting artiness with black & white and cle...,0
2,Very little music or anything to speak of.,0
3,The best scene in the movie was when Gerardo i...,1
4,"The rest of the movie lacks art, charm, meanin...",0


In [23]:
imdb.shape

(747, 2)

In [24]:
data = pd.concat([yelp, amazon])

data = pd.concat([data, imdb])

data.shape

(2745, 2)

In [25]:
data.head()

Unnamed: 0,Sentiment,Class
0,Crust is not good.,0
1,Not tasty and the texture was just nasty.,0
2,Stopped by during the late May bank holiday of...,1
3,The selection on the menu was great and so wer...,1
4,Now I am getting angry and I want my damn pho.,0


In [27]:
data.isnull().sum()

Sentiment    0
Class        0
dtype: int64

### 2) Text Preprocessing

In [28]:
# 2) Normalizing Text (lower) 

def normalize(content):
    content = content.lower()
    return content

# 3) Removing Puncuatations

def remove_punct(content):
    content = content.translate(content.maketrans("", "", string.punctuation))
    return content

#  4) cleaning digits

def remove_num(content):
    content = ''.join([i for i in content if not i.isdigit()])

    return content

# 5) Remove extra-space

def remove_spaces(content):
    content = " ".join(content.split())

    return content

# 6)# Remove Contraction

def remove_cont(content):
    content = contractions.fix(content)
    
    return content

# 7) Remove Html Tags

def remove_html(content):
    # parse html content
    soup = BeautifulSoup(content, "html.parser")

    for data in soup(['style', 'script', 'code', 'a']):
        # Remove tags
        data.decompose()
    # return data by retrieving the tag content
    content = ' '.join(soup.stripped_strings)
    
    return content

# 8) Remove URLs and E-mails (UniCode)

def remove_unicode(content):
    content = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", content)
    return content

# 9) Removing Stop Words

def remove_stopword(content):
    STOPWORDS = set(stopwords.words('english'))
    content = " ".join([word for word in str(content).split() if word not in STOPWORDS])
    return content

# 10) Standardizing and Spell Check

def Standardize(content):
    content = ''.join(''.join(s)[:2] for _, s in itertools.groupby(content))
    spell = Speller(lang='en')
    content = spell(content)
    return content

# 11) Remove some Extra-words

def remove_extrawords(content):
    stop=['href','lt','gt','ii','iii','ie','quot','com']  ## This all words are most repeated words it does not make any sense
    content = content.split(" ")
    filtered_list=[]
    for i in content:
        if i not in stop:
            filtered_list.append(i)
            
    content = ' '.join(filtered_list)
    return content

In [29]:
def clean_text(content):
    
    # Normalize
    
    Normalized_content = normalize(content)
    
    # Removing Puncuatations
    
    clean_content = remove_punct(Normalized_content)

    # cleaning digits

    clean_content = remove_num(clean_content)

    # Remove extra-space
    
    clean_content = remove_spaces(clean_content)
    
    # Remove Contraction

    clean_content = remove_cont(clean_content)
    
    # Remove Html Tags

    clean_content = remove_html(clean_content)
    
    # Remove URLs and E-mails (UniCode)

    clean_content = remove_unicode(clean_content)
    
    # Removing Stop Words

    clean_content = remove_stopword(clean_content)

    # Remove some Extra-words
    
    clean_content = remove_extrawords(clean_content)

    return clean_content

In [30]:
data['Sentiment'] = data['Sentiment'].apply(clean_text)

In [31]:
data.head()

Unnamed: 0,Sentiment,Class
0,crust good,0
1,tasty texture nasty,0
2,stopped late may bank holiday rick steve recom...,1
3,selection menu great prices,1
4,getting angry want damn pho,0


In [17]:
data.to_csv(r'../Data/cleaned_reviews.csv', index = False)