In [1]:
import torch
import pandas as pd
from tqdm.notebook import tqdm
import re, string
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
CachedStopWords = stopwords.words("english")
from nltk.tokenize import word_tokenize

PUNCT_TO_REMOVE = string.punctuation

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\moham\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Import data

In [2]:
# For IMDB data
data=pd.read_csv('IMDB Dataset.csv')
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# For YELP data
#data=pd.read_csv('YELP Dataset.csv')
#data.head()

## Functions to clean data

In [3]:
def remove_punctuation(text):    
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

def remove_html(text):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)

def remove_stopwords(text):
    return ' '.join([word for word in text.split() if word not in CachedStopWords])
    

# Clean the data

In [4]:
def clean_text(text):
    
    # Lower Casing
    text = text.lower()
    
    # Remove url
    text = remove_urls(text) 
    
    # Remove html tags
    text = remove_html(text)
    
    # Removing @tags
    text = re.sub('@\w*','',text)
    
    # Removing Punctuations
    text = remove_punctuation(text)
    
    # Removing new lines
    text = re.sub('\\n',' ',text)
    
    # Remove stop words
    text = remove_stopwords(text)
    
    return text

In [6]:
data["clean_text"] = data["review"].apply(lambda text: clean_text(text))

In [7]:
data

Unnamed: 0,review,sentiment,labels,clean_text
0,One of the other reviewers has mentioned that ...,positive,1,one reviewers mentioned watching 1 oz episode ...
1,A wonderful little production. <br /><br />The...,positive,1,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,positive,1,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,negative,0,basically theres family little boy jake thinks...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1,petter matteis love time money visually stunni...
...,...,...,...,...
49995,I thought this movie did a down right good job...,positive,1,thought movie right good job wasnt creative or...
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,0,bad plot bad dialogue bad acting idiotic direc...
49997,I am a Catholic taught in parochial elementary...,negative,0,catholic taught parochial elementary schools n...
49998,I'm going to have to disagree with the previou...,negative,0,im going disagree previous comment side maltin...


## Handle the output varibale encoding

In [8]:
# For IMDB data
#labels = {'positive': 1, 'negative': 0}
#data['labels'] = data.sentiment.replace(labels)
#data

Unnamed: 0,review,sentiment,labels,clean_text
0,One of the other reviewers has mentioned that ...,positive,1,one reviewers mentioned watching 1 oz episode ...
1,A wonderful little production. <br /><br />The...,positive,1,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,positive,1,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,negative,0,basically theres family little boy jake thinks...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1,petter matteis love time money visually stunni...
...,...,...,...,...
49995,I thought this movie did a down right good job...,positive,1,thought movie right good job wasnt creative or...
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,0,bad plot bad dialogue bad acting idiotic direc...
49997,I am a Catholic taught in parochial elementary...,negative,0,catholic taught parochial elementary schools n...
49998,I'm going to have to disagree with the previou...,negative,0,im going disagree previous comment side maltin...


## Split data into train, test, split (70:15:15)

In [12]:
# For IMDB data
X_train, X_test, y_train, y_test = train_test_split(data[['clean_text']], data[['labels']], 
                                                    test_size=0.15, random_state=1,stratify=data[['labels']])

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, 
                                                  test_size=0.17647, random_state=1, stratify=y_train) # 0.17647 x 0.85 = 0.15

In [None]:
# For YELP data
#X_train, X_test, y_train, y_test = train_test_split(data[['clean_text']], data[['labels']], 
#                                                    test_size=0.3, random_state=1,stratify=data[['labels']])

#X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, 
#                                                test_size=0.5, random_state=1, stratify=y_test)

In [None]:
X_train['labels'] = y_train
X_test['labels'] = y_test
X_val['labels'] = y_val

## Save data

In [14]:
# For IMDB data
X_train.to_csv('IMDB_train.csv',index=False)
X_test.to_csv('IMDB_test.csv',index=False)
X_val.to_csv('IMDB_val.csv',index=False)

In [None]:
# For YELP data
#X_train.to_csv('YELP_train.csv',index=False)
#X_test.to_csv('YELP_test.csv',index=False)
#X_val.to_csv('YELP_val.csv',index=False)