In [29]:
import pandas as pd
import os, re

In [30]:
!pip install nltk
import nltk
nltk.download('stopwords')



[nltk_data] Downloading package stopwords to
You should consider upgrading via the 'C:\UIT_repos\methods\ai_methods\venv\Scripts\python.exe -m pip install --upgrade pip' command.

[nltk_data]   Package stopwords is already up-to-date!


True

# Load data from file

In [31]:
# checks if file is relevant for dataset
def is_relevant_file(name, root):
    """Return True when `name` is a text file inside a labelled review folder.

    Args:
        name: File name, e.g. as yielded by os.walk.
        root: Directory path that contains `name`.

    Returns:
        bool: True only if `name` ends with 'txt' AND `root` ends with
        'pos' or 'neg'.
    """
    # `and` binds tighter than `or`: without grouping, the test
    # "txt and pos or neg" accepts every file under a '...neg' directory
    # regardless of its extension. Both suffixes are checked together here.
    return name.endswith('txt') and root.endswith(('pos', 'neg'))

# returns the dataset type for a given root
def get_dataset_type(root):
    """Map a directory path to the dataset split named in it.

    Returns 'test' if the substring 'test' occurs in `root`, otherwise
    'train' if 'train' occurs. When neither occurs, prints "na" and
    returns None.
    """
    # 'test' is tried first, so it wins when both substrings are present.
    for split in ('test', 'train'):
        if split in root:
            return split
    print("na")

In [32]:
# accumulator: the loading loop appends one dict per review file to this list
data = []
# top-level folder that os.walk traverses in the loading loop
path = 'aclImdb' # insert correct path for folder aclImdb

In [33]:
# Start from an empty list so that re-running this cell does not append the
# same files a second time.
data = []
for root, _dirs, files in os.walk(path):
    for name in files:
        # Skip files outside the labelled folders before building a record.
        if not is_relevant_file(name, root):
            continue
        with open(os.path.join(root, name), encoding='utf8') as f:
            text = f.read()
        data.append({
            'name': name,
            'text': text,
            'dataset': get_dataset_type(root),
            # last three characters of the folder path ('pos' / 'neg')
            'label': root[-3:],
        })

# Create pd DataFrame with data

In [34]:
# one DataFrame row per record dict collected in `data`
df = pd.DataFrame.from_records(data)
# preview the first rows
df.head()

Unnamed: 0,name,text,dataset,label
0,0_2.txt,Once again Mr. Costner has dragged out a movie...,test,neg
1,10000_4.txt,This is an example of why the majority of acti...,test,neg
2,10001_1.txt,"First of all I hate those moronic rappers, who...",test,neg
3,10002_3.txt,Not even the Beatles could write songs everyon...,test,neg
4,10003_3.txt,Brass pictures (movies is not a fitting word f...,test,neg


# Perform Pre-processing on data
**Note:** The following operations use methods from the 'text_preprocessing.ipynb' file provided by the course. I have decided to process the text without storing intermediate columns, in order to keep a simple dataframe that is tailored to a specific imagined purpose.

**Preprocessing steps include**
- lower casing
- removing punctuation
- removing stopwords
- removing most frequent words
- removing most rare words
- removing emojis
- translating emoticons to words
- removing URLs

#### Lower Casing

In [46]:
# overwrite the text column with its lower-cased version
df["text"] = df["text"].str.lower()
# preview the first rows
df.head()

Unnamed: 0,name,text,dataset,label
0,0_2.txt,mr costner dragged far longer necessary aside ...,test,neg
1,10000_4.txt,example majority action films generic boring t...,test,neg
2,10001_1.txt,first hate moronic rappers couldnt act gun pre...,test,neg
3,10002_3.txt,beatles could write songs everyone liked altho...,test,neg
4,10003_3.txt,brass pictures movies fitting word somewhat br...,test,neg


#### Removing punctuation

In [47]:
import string

PUNCT_TO_REMOVE = string.punctuation
# Build the deletion table once instead of on every call: the three-argument
# form of str.maketrans maps each character of the third argument to None.
_PUNCT_TABLE = str.maketrans('', '', PUNCT_TO_REMOVE)

def remove_punctuation(text):
    """Return `text` with every character of PUNCT_TO_REMOVE deleted.

    Characters are removed, not replaced by a space, so words separated only
    by punctuation are fused together (e.g. "movie,and" -> "movieand").
    """
    return text.translate(_PUNCT_TABLE)

# strip punctuation from every review; the function is passed directly
df["text"] = df["text"].apply(remove_punctuation)
df.head()

Unnamed: 0,name,text,dataset,label
0,0_2.txt,mr costner dragged far longer necessary aside ...,test,neg
1,10000_4.txt,example majority action films generic boring t...,test,neg
2,10001_1.txt,first hate moronic rappers couldnt act gun pre...,test,neg
3,10002_3.txt,beatles could write songs everyone liked altho...,test,neg
4,10003_3.txt,brass pictures movies fitting word somewhat br...,test,neg


#### Stopwords Removal

In [48]:

from nltk.corpus import stopwords

# English stopword list as a set, for constant-time membership tests
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    """Remove the stopwords from `text`.

    `text` is coerced with str(), split on whitespace, every token found in
    STOPWORDS is dropped, and the remaining tokens are joined with single
    spaces.
    """
    return " ".join(word for word in str(text).split() if word not in STOPWORDS)

# drop stopwords from every review; the function is passed directly
df["text"] = df["text"].apply(remove_stopwords)
df.head()

Unnamed: 0,name,text,dataset,label
0,0_2.txt,mr costner dragged far longer necessary aside ...,test,neg
1,10000_4.txt,example majority action films generic boring t...,test,neg
2,10001_1.txt,first hate moronic rappers couldnt act gun pre...,test,neg
3,10002_3.txt,beatles could write songs everyone liked altho...,test,neg
4,10003_3.txt,brass pictures movies fitting word somewhat br...,test,neg


#### Remove Frequent Words

In [49]:
from collections import Counter
# tally how often each whitespace-separated token occurs across all reviews
cnt = Counter()
for review in df["text"].values:
    cnt.update(review.split())

# the ten most frequent tokens with their counts
cnt.most_common(10)

[('see', 22534),
 ('story', 22090),
 ('much', 18947),
 ('well', 18791),
 ('get', 18204),
 ('great', 17819),
 ('also', 17816),
 ('bad', 17704),
 ('people', 17538),
 ('first', 17154)]

In [50]:
# the ten most frequent tokens according to `cnt`
FREQWORDS = {token for token, _count in cnt.most_common(10)}
def remove_freqwords(text):
    """Drop every token of `text` that is one of the FREQWORDS.

    `text` is coerced with str() and split on whitespace; surviving tokens
    are joined with single spaces.
    """
    kept = [token for token in str(text).split() if token not in FREQWORDS]
    return " ".join(kept)

# drop the most frequent words from every review
df["text"] = df["text"].apply(remove_freqwords)
df.head()

Unnamed: 0,name,text,dataset,label
0,0_2.txt,mr costner dragged far longer necessary aside ...,test,neg
1,10000_4.txt,example majority action films generic boring t...,test,neg
2,10001_1.txt,hate moronic rappers couldnt act gun pressed f...,test,neg
3,10002_3.txt,beatles could write songs everyone liked altho...,test,neg
4,10003_3.txt,brass pictures movies fitting word somewhat br...,test,neg


#### Rare Words Removal

In [51]:
n_rare_words = 10
# the reversed slice picks the last `n_rare_words` entries of the frequency
# ranking, i.e. the least common tokens in `cnt`
RAREWORDS = {token for token, _count in cnt.most_common()[:-n_rare_words-1:-1]}
def remove_rarewords(text):
    """Drop every token of `text` that is one of the RAREWORDS.

    `text` is coerced with str() and split on whitespace; surviving tokens
    are joined with single spaces.
    """
    kept = [token for token in str(text).split() if token not in RAREWORDS]
    return " ".join(kept)

# drop the rarest words from every review
df["text"] = df["text"].apply(remove_rarewords)
df.head()

Unnamed: 0,name,text,dataset,label
0,0_2.txt,mr costner dragged far longer necessary aside ...,test,neg
1,10000_4.txt,example majority action films generic boring t...,test,neg
2,10001_1.txt,hate moronic rappers couldnt act gun pressed f...,test,neg
3,10002_3.txt,beatles could write songs everyone liked altho...,test,neg
4,10003_3.txt,brass pictures movies fitting word somewhat br...,test,neg


#### Emojis Removal

In [52]:
def remove_emoji(string):
    """Delete every character in the code-point ranges listed below.

    src: https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b

    NOTE(review): the last range (U+24C2 to U+1F251) covers far more than
    emoji (CJK ideographs, for example, fall inside it), so such characters
    are removed as well. Confirm this is intended.
    """
    code_point_ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (iOS)
        "\U00002702-\U000027B0",
        "\U000024C2-\U0001F251",
    )
    # one character class holding all ranges, matching runs of such characters
    character_class = "[" + "".join(code_point_ranges) + "]+"
    return re.sub(character_class, '', string, flags=re.UNICODE)
# strip emoji characters from every review
df["text"] = df["text"].apply(remove_emoji)
df.head()

Unnamed: 0,name,text,dataset,label
0,0_2.txt,mr costner dragged far longer necessary aside ...,test,neg
1,10000_4.txt,example majority action films generic boring t...,test,neg
2,10001_1.txt,hate moronic rappers couldnt act gun pressed f...,test,neg
3,10002_3.txt,beatles could write songs everyone liked altho...,test,neg
4,10003_3.txt,brass pictures movies fitting word somewhat br...,test,neg


In [53]:
# src : https://github.com/NeelShah18/emot/blob/master/emot/emo_unicode.py
# Keys are regular-expression patterns (brackets and '^' are escaped because
# the keys are fed to re.sub); values are the description substituted for a
# match. Raw strings keep the backslashes verbatim instead of relying on
# Python's deprecated pass-through of unknown escapes such as "\)".
EMOTICONS = {
    r":‑\)": "Happy face or smiley",
    r":\)": "Happy face or smiley",
    r":-\]": "Happy face or smiley",
    r":\]": "Happy face or smiley",
    r":-3": "Happy face smiley",
    r":3": "Happy face smiley",
    r":->": "Happy face smiley",
    r":>": "Happy face smiley",
    r"8-\)": "Happy face smiley",
    r":o\)": "Happy face smiley",
    r":-\}": "Happy face smiley",
    r":\}": "Happy face smiley",
    r":-\)": "Happy face smiley",
    r":c\)": "Happy face smiley",
    r":\^\)": "Happy face smiley",
    r"=\]": "Happy face smiley",
    r"=\)": "Happy face smiley"
}

#### Convert Emoticons to words

In [54]:
def convert_emoticons(text):
    """Replace each emoticon pattern from EMOTICONS with its description.

    The description has commas removed and its words joined by underscores
    (e.g. "Happy face smiley" becomes "Happy_face_smiley") before it is
    substituted for every match of the pattern in `text`.
    """
    for pattern, description in EMOTICONS.items():
        words = description.replace(",","").split()
        text = re.sub(u'('+pattern+')', "_".join(words), text)
    return text

# translate emoticons to words in every review
df["text"] = df["text"].apply(convert_emoticons)
df.head()

Unnamed: 0,name,text,dataset,label
0,0_2.txt,mr costner dragged far longer necessary aside ...,test,neg
1,10000_4.txt,example majority action films generic boring t...,test,neg
2,10001_1.txt,hate moronic rappers couldnt act gun pressed f...,test,neg
3,10002_3.txt,beatles could write songs everyone liked altho...,test,neg
4,10003_3.txt,brass pictures movies fitting word somewhat br...,test,neg


#### URLs Removal

In [55]:
def remove_urls(text):
    """Delete URLs from `text`.

    A URL is anything starting with 'http://', 'https://' or 'www.' and
    running up to the next whitespace character.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)
# Strip URLs from every review using remove_urls defined in this cell.
# NOTE(review): the punctuation step earlier in the notebook already deleted
# ':', '/' and '.', so "https://" and "www." may no longer be intact when
# this runs. Confirm the intended order of the preprocessing steps.
df["text"] = df["text"].apply(remove_urls)
df.head()

Unnamed: 0,name,text,dataset,label
0,0_2.txt,mr costner dragged far longer necessary aside ...,test,neg
1,10000_4.txt,example majority action films generic boring t...,test,neg
2,10001_1.txt,hate moronic rappers couldnt act gun pressed f...,test,neg
3,10002_3.txt,beatles could write songs everyone liked altho...,test,neg
4,10003_3.txt,brass pictures movies fitting word somewhat br...,test,neg


# Printing the complete processed dataframe

In [56]:
# show the processed frame as the cell output
df

Unnamed: 0,name,text,dataset,label
0,0_2.txt,mr costner dragged far longer necessary aside ...,test,neg
1,10000_4.txt,example majority action films generic boring t...,test,neg
2,10001_1.txt,hate moronic rappers couldnt act gun pressed f...,test,neg
3,10002_3.txt,beatles could write songs everyone liked altho...,test,neg
4,10003_3.txt,brass pictures movies fitting word somewhat br...,test,neg
...,...,...,...,...
49995,9998_9.txt,seeing vote average pretty low fact clerk vide...,train,pos
49996,9999_8.txt,plot wretched unbelievable twists however chem...,train,pos
49997,999_10.txt,amazed movieand others average 5 stars lower c...,train,pos
49998,99_8.txt,christmas together actually came ive raised jo...,train,pos
