In [2]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer
import re

In [2]:
# Read only the first 500 rows of sentiment140.csv. The file is parsed without a
# header row (header=None), so the six column labels are attached by hand afterwards.
df = pd.read_csv(
    'sentiment140.csv',
    header=None,
    nrows=500,
    encoding='ISO-8859-1',
)
df.columns = [
    'polarity',
    'id',
    'date',
    'query',
    'user',
    'text',
]

In [3]:
# Columns that are not carried forward; after the drop only 'polarity' and 'text' remain.
cols = ['id', 'date', 'query', 'user']
df = df.drop(columns=cols)

In [None]:
# Fetch the NLTK data packages first: the stop-word corpus has to be available
# before stopwords.words() is read further down in this cell.
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

# Shared preprocessing objects, created once and reused by clean_text().
stemmer = PorterStemmer()
tokenizer = TweetTokenizer()
stop_words = set(stopwords.words('english'))  # set -> fast membership tests per token

In [5]:
def clean_text(text):
    """Normalise one raw tweet into a space-separated string of stemmed tokens.

    Steps, in order: remove URLs, @mentions and '#' characters; lowercase;
    delete every character that is neither a word character nor whitespace;
    tokenize with the module-level ``tokenizer``; discard tokens contained in
    ``stop_words``; stem what is left with ``stemmer``.

    Args:
        text: The raw tweet. Missing values (``None``, NaN, ``pd.NA`` -- e.g.
            an empty cell loaded by pandas) are treated as an empty tweet
            instead of raising ``TypeError``; any other non-string value is
            converted with ``str`` before cleaning.

    Returns:
        str: The surviving stemmed tokens joined by single spaces, or ``""``
        when the input is missing or nothing survives the filtering.
    """
    if not isinstance(text, str):
        # re.sub() rejects non-strings, so a single missing cell would abort
        # a whole DataFrame.apply(); map it to an empty document instead.
        if pd.isna(text):
            return ""
        text = str(text)

    # clean text of urls, mentions and hashtags
    # (only the '#' character is dropped; the tag word itself is kept)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#', '', text)

    # send the text to lowercase
    text = text.lower()

    # remove punctuation
    # NOTE: this happens before stop-word filtering, so contractions lose their
    # apostrophe ("can't" -> "cant") and are compared against stop_words in
    # that stripped form.
    text = re.sub(r'[^\w\s]', '', text)

    # tokenize, drop stopwords and stem the remaining tokens
    tokens = tokenizer.tokenize(text)
    return " ".join(
        stemmer.stem(token) for token in tokens if token not in stop_words
    )

In [6]:
# Run every raw tweet through clean_text() and keep the result next to the original.
df['cleaned_text'] = df['text'].map(clean_text)

In [7]:
# Show the first five raw tweets, for comparison with their cleaned versions.
print(df['text'].head(n=5))

0    @switchfoot http://twitpic.com/2y1zl - Awww, t...
1    is upset that he can't update his Facebook by ...
2    @Kenichan I dived many times for the ball. Man...
3      my whole body feels itchy and like its on fire 
4    @nationwideclass no, it's not behaving at all....
Name: text, dtype: object


In [8]:
# Show the same five tweets after clean_text() has been applied.
print(df['cleaned_text'].head(n=5))

0         that bummer shoulda got david carr third day
1    upset cant updat facebook text might cri resul...
2      dive mani time ball manag save 50 rest go bound
3                      whole bodi feel itchi like fire
4                                behav im mad cant see
Name: cleaned_text, dtype: object


In [None]:
class MLP(nn.Module):
    