In [24]:
import re
import string
import pandas as pd
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

In [53]:
import nltk
# Fetch the NLTK data this notebook relies on: the stop-word lists (read via
# stopwords.words below) and the WordNet data behind WordNetLemmatizer.
# NOTE(review): 'omw-1.4' is presumably needed by the WordNet reader in newer
# NLTK releases -- confirm against the installed version.
# Only the value of the final call is shown as the cell result.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ammar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ammar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Ammar\AppData\Roaming\nltk_data...


True

In [64]:
# Load the tweets from a CSV file located next to the notebook (relative path)
# and preview the first rows. The preview shows an 'Unnamed: 0' column next to
# 'Tweet' and 'Avg'.
# NOTE(review): 'Unnamed: 0' looks like a saved index column, and 'Avg' is
# presumably a sentiment score -- confirm against the data source.
df = pd.read_csv(r'./tweets.csv')
df.head()

Unnamed: 0,Tweet,Avg
0,"I have to say, Apple has by far the best custo...",2.0
1,iOS 7 is so fricking smooth & beautiful!! #Tha...,2.0
2,LOVE U @APPLE,1.8
3,"Thank you @apple, loving my new iPhone 5S!!!!!...",1.8
4,.@apple has the best customer service. In and ...,1.8


In [65]:
# Keep only the tweet text as a string-typed Series; the other columns are
# not used by the keyword extraction below.
# NOTE(review): `df` is rebound from a DataFrame to a Series here, so this
# cell is not safe to run twice without re-running the load cell first.
df = df['Tweet'].astype('str')
df.shape

(1181,)

In [66]:
# Preview the first tweets of the text-only Series.
df.head()

0    I have to say, Apple has by far the best custo...
1    iOS 7 is so fricking smooth & beautiful!! #Tha...
2                                        LOVE U @APPLE
3    Thank you @apple, loving my new iPhone 5S!!!!!...
4    .@apple has the best customer service. In and ...
Name: Tweet, dtype: object

In [76]:
# Helpers for stripping punctuation symbols and stop words from tweets:
#   stop_words      -- NLTK's English stop-word list as a set
#   translate_table -- maps every character of string.punctuation to None, so
#                      str.translate() deletes those characters
stop_words = set(stopwords.words('english'))
translate_table = str.maketrans('', '', string.punctuation)

In [77]:
# Extend the NLTK stop words with corpus-specific terms: filler words, brand
# and product names, and a few tokens in the form they take after punctuation
# removal ('im', 'dont', 'cant').
new_stop_words = [
    'some', 'like', 'think', 'wow', 'one', 'http', 'web', 'really',
    'see', 'watch', 'apple', 'know', 'show', 'think', 'click', 'go',
    'to', 'great', 'very', 'good', 'many', 'more', 'people', 'made',
    'technology', 'tech', 'iphone', 'ipad', 'new', 'latest', 'phone',
    'itunes', 'brand', 'ipod', 'iphones', 'io', 'get', 'buy', 'purchase',
    'make', 'im', "iam", 'dont', 'cant', 'promoipodplayerpromo',
    'ipodplayerpromo', 'player', 'itune',
]
# Build a new combined set; duplicates in the list collapse automatically.
stop_words = stop_words | set(new_stop_words)

In [78]:
# Text Pre-Processing
# Every tweet is lower-cased, stripped of digits and punctuation, split on
# whitespace, filtered against `stop_words` and lemmatized. The surviving
# tokens are joined back into one space-separated string per tweet.

# Lemmatization reduces each word to its root/canonical form. The lemmatizer
# is created once and reused for every tweet.
lm = WordNetLemmatizer()

corpus = []
# Series.items() is used because Series.iteritems() was removed in pandas 2.0.
for _, line in df.items():
    line = re.sub(r"\d+", "", line.lower())
    tokens = line.translate(translate_table).split()
    # Stop words are dropped before lemmatizing, so the check runs on the
    # surface form of each token.
    lemmas = [lm.lemmatize(word) for word in tokens if word not in stop_words]
    corpus.append(" ".join(lemmas))

corpus[1100]

'teaser trailer macbook pro freak'

In [79]:
from sklearn.feature_extraction.text import CountVectorizer 

In [80]:
# Terms that appear in more than 70% of the documents are ignored (max_df=0.7)
def get_top_keywords(corpus, upto=None):
    """Return the most frequent unigrams and bigrams found in ``corpus``.

    Parameters
    ----------
    corpus : iterable of str
        Pre-processed documents, one string per document.
    upto : int or None, optional
        Maximum number of (term, count) pairs to return. ``None`` returns
        every term in the fitted vocabulary.

    Returns
    -------
    list of (str, count) tuples
        Terms paired with their total count across the corpus, sorted by
        descending count.

    Notes
    -----
    Reads the module-level ``stop_words`` collection. Terms present in more
    than 70% of the documents (``max_df``) or in fewer than 0.1% of them
    (``min_df``) are excluded from the vocabulary.
    """
    cv = CountVectorizer(max_df=0.7,
                         # Recent scikit-learn releases validate this
                         # parameter and accept only 'english', a list or
                         # None, so the set is converted to a sorted list.
                         stop_words=sorted(stop_words),
                         ngram_range=(1, 2),
                         min_df=0.001)
    # Fit and vectorize in one pass; a separate transform() of the same
    # corpus would only repeat the work.
    bag_of_words = cv.fit_transform(corpus)
    # Column sums give the total count of each term; the result is 1 x n_terms.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in cv.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:upto]

In [83]:
# Collect the 70 most frequent terms into a two-column table. The column
# labels are passed to the constructor so that an empty keyword list yields an
# empty frame with the right columns; assigning `.columns` afterwards would
# raise a length-mismatch error in that case.
top_words = get_top_keywords(corpus, upto=70)
top_words_df = pd.DataFrame(top_words, columns=['Word', 'Frequency'])

In [85]:
# Display the keyword/frequency table built in the previous cell.
top_words_df

Unnamed: 0,Word,Frequency
0,app,39
1,de,39
2,freak,39
3,store,38
4,rt,38
...,...,...
65,freaking,14
66,news,14
67,could,14
68,using,14
