In [23]:
import numpy as np
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from sklearn.feature_extraction.text import TfidfVectorizer

In [24]:
data = pd.read_csv('tweet_emotions.csv')
print(data['content'])

0        @tiffanylue i know  i was listenin to bad habi...
1        Layin n bed with a headache  ughhhh...waitin o...
2                      Funeral ceremony...gloomy friday...
3                     wants to hang out with friends SOON!
4        @dannycastillo We want to trade with someone w...
                               ...                        
39995                                     @JohnLloydTaylor
39996                       Happy Mothers Day  All my love
39997    Happy Mother's Day to all the mommies out ther...
39998    @niariley WASSUP BEAUTIFUL!!! FOLLOW ME!!  PEE...
39999    @mopedronin bullet train from tokyo    the gf ...
Name: content, Length: 40000, dtype: object


In [25]:
def data_processing(content):
    content = content.lower()
    content = re.sub(r"https\S+|www\S+http\S+",'', content, flags=re.MULTILINE)
    content = re.sub(r'\@w+|\#','', content)
    content = re.sub(r'[^\w\s]','', content)
    content_token = word_tokenize(content)
    filtered_content = [w for w in content_token if not w in stop_words]
    return " ".join(filtered_content)

In [26]:
data.content = data['content'].apply(data_processing)

In [27]:
print(data['content'])

0        tiffanylue know listenin bad habit earlier sta...
1                   layin n bed headache ughhhhwaitin call
2                            funeral ceremonygloomy friday
3                                  wants hang friends soon
4        dannycastillo want trade someone houston ticke...
                               ...                        
39995                                      johnlloydtaylor
39996                               happy mothers day love
39997    happy mothers day mommies woman man long youre...
39998    niariley wassup beautiful follow peep new hit ...
39999    mopedronin bullet train tokyo gf visiting japa...
Name: content, Length: 40000, dtype: object


In [28]:
stemmer = PorterStemmer()
def stemming(data):
    content = [stemmer.stem(word) for word in data]
    return data

In [29]:
data['content'] = data['content'].apply(lambda x: stemming(x))

In [30]:
vect = TfidfVectorizer(ngram_range=(1,2)).fit(data['content'])
feature_names = vect.get_feature_names()
print(feature_names[:20])

['00', '00 graduated', '000', '000 dunno', '000 httpplurkcompwxj54', '0003', '0003 im', '002', '006', '006 totally', '01', '01 final', '01 girls', '01 mm', '010', '010 050', '0128', '0128 morning', '01theone', '01theone looking']


