In [109]:
import pandas
import math
import numpy
import main
from datetime import timedelta

We import all the data we wrote into an excel file for the n-gram models and check if they're read correctly. Further, we import the actual json data provided as well.

In [2]:
oneGram = pandas.read_csv('1-gram outputs.csv', index_col=0, header=0, keep_default_na=False, na_values=[''])

In [3]:
twoGrams = pandas.read_csv('2-gram outputs.csv', index_col=0, header=0)

In [4]:
origData = pandas.read_json('News_Category_Dataset_v2.json', lines=True)

In [5]:
print(oneGram.head())
print('Length: ' + str(len(oneGram)) + '\n\n')
print(twoGrams.head())
print('Length: ' + str(len(twoGrams)))

                       Frequency
a                         134348
aa                            14
aaa                            7
aaaaaaaaaaaaaaahhhhhh          1
aaaaaah                        1
Length: 84465


               Frequency
a a                   37
a aaron                1
a aawful               1
a about                1
a abstraction          1
Length: 1797575


We sort the 1-gram list to see what the most common words are. Unfortunately, they are what are known as "stop words". In order to remove them, we use the natural language toolkit to import a list of the most commonly used stop words. Further, we also want to remove plurals, and for that purpose we use the SnowballStemmer package inside nltk to find and remove all words with the same stem (except for one).

In [6]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

In [7]:
oneGram.drop(stop_words, inplace=True, errors='ignore')
print(oneGram.head())

                       Frequency
aa                            14
aaa                            7
aaaaaaaaaaaaaaahhhhhh          1
aaaaaah                        1
aaaargh                        1


In [8]:
stemmedDict = dict(zip([stemmer.stem(ind) for ind in oneGram.index.tolist()], oneGram.index.tolist()))
# creating a dict automatically drops duplicate values so the list above is unique
oneGram.drop([ind for ind in oneGram.index.tolist() if ind not in stemmedDict[stemmer.stem(ind)]!=ind], inplace=True, errors='ignore')
oneGram.sort_values(by=['Frequency'], axis=0, ascending=False, inplace=True)
print(oneGram.head())

        Frequency
new         18771
trump       18252
one         16352
people      12267
photos      11375


In the 2-gram list, we again remove any phrases containing stop-words since they do not convey any additional info and the remaining word that they are combined with would already be in the 1-gram list. This reduces the size considerably and allows for more complicated operations to be performed.

The stemming part for 2-grams is a little more complicated, but essentially we separate the first and second word from the indexes into 2 separate lists. Then, we find stem words for the first word list, and piece together a unique list of 2 word strings by combining it back with the list of second words. Anything not appearing in this unique list is removed. We repeat the operation from the other side (find stem words for second word list and so on).

In [9]:
twoGrams.sort_values(by=['Frequency'], ascending=False, inplace=True)
print(twoGrams.head())

        Frequency
of the      23037
in the      20092
it s        14745
to the       9588
to be        9160


In [10]:
# this cell takes a little bit of time, but the clean up ratio is worth it!

deleter_column = []
for key in twoGrams.index.tolist():
    for stpword in stop_words:
        if (stpword+' ' in key.split(' ')[0]+' ') or (' '+stpword in ' '+key.split(' ')[-1]):
            deleter_column.append(True)
            break
        if stop_words.index(stpword) == len(stop_words)-1:
            deleter_column.append(False)

twoGrams['deleter_col']=deleter_column
twoGrams.drop(twoGrams[twoGrams['deleter_col']==True].index, inplace=True)

twoGrams.sort_values(by=['Frequency'], ascending=False, inplace=True)
twoGrams.drop('deleter_col', axis=1, inplace=True)
del deleter_column
print(twoGrams.head())

                  Frequency
white house            1568
health care            1249
climate change         1107
twitter facebook        888
york city               691


In [11]:
firstWordList = [ind.split(' ')[0] for ind in twoGrams.index.tolist()]
secondWordList = [ind.split(' ')[1] for ind in twoGrams.index.tolist()]
stemmedDict = dict(zip([stemmer.stem(firstWordList[ind]) + ' ' + secondWordList[ind] for ind in range(len(firstWordList))],
                       twoGrams.index.tolist()))
twoGrams.drop([ind for ind in twoGrams.index.tolist() if stemmedDict[stemmer.stem(ind.split(' ')[0])+' '+ind.split(' ')[1]]!=ind], inplace=True, errors='ignore')

In [12]:
firstWordList = [ind.split(' ')[0] for ind in twoGrams.index.tolist()]
secondWordList = [ind.split(' ')[1] for ind in twoGrams.index.tolist()]
stemmedDict = dict(zip([firstWordList[ind] + ' ' + stemmer.stem(secondWordList[ind]) for ind in range(len(firstWordList))],
                       twoGrams.index.tolist()))
twoGrams.drop([ind for ind in twoGrams.index.tolist() if stemmedDict[ind.split(' ')[0]+' '+stemmer.stem(ind.split(' ')[1])]!=ind], inplace=True, errors='ignore')

We consider the word list to be sufficiently clean. As we can see, we have brought down the size of 1-grams to ~58,000 from the initial list of ~84,000. With 2-grams, we have done even better and brought down the list close to 200,000 from an initial list which was closer to 1.8 million. Now that we've cleaned the data set, we get down to analysis.

To find the most important words, we try to negate the effect of "filler words" that have not been cleaned with the stop words list (such as "new", "one", "time" etc). To do this, we sort the 1-gram list by the product of word length and (log of) frequency, based on the intuition that filler words are shorter and we can reduce the impact of their frequency by taking log. This turns out to have commonality with the TF-IDF method, which isn't directly applicable here.

In [13]:
oneGram['helper_col'] = oneGram.apply(lambda key: len(key.name)*math.log(key), axis=1)
oneGram.sort_values(by=['helper_col'], axis=0, ascending=False, inplace=True)

We repeat the same for the 2-gram list. Although we could use the 1-gram list to further enhance the 2-gram filter instead of repeating the process, we tend to lost information through that method. For example, while "white" may be a low importance filler word in the 1-gram analysis, "white house" would be an important phrase in the 2-gram list. Hence, using the importance of "white" from the 1-gram analysis does not add (and in fact, detracts) to our analysis of the 2-gram list.

In [184]:
twoGrams['helper_col'] = twoGrams.apply(lambda key: len(key.name)*math.log(key), axis=1)
twoGrams.sort_values(by=['helper_col'], axis=0, ascending=False, inplace=True)

Further, we take the original data into consideration. Since the dataset has a category column, we can use it to enhance our analysis, along with the headlines. We measure the importance of a single word by considering headlines within a given category, and measuring their occurrence within them. We make this into a new dataframe.

In [206]:
sourceDates = pandas.to_datetime(origData.date.unique())
sourceDates = [dt for dt in sourceDates if dt.weekday()==4]

categoryDataFrame = pandas.DataFrame(columns = origData.category.unique(), index = pandas.MultiIndex.from_product([[1,2], sourceDates]))
del sourceDates

In [None]:
for dt in categoryDataFrame.index.get_level_values(1):
    print(dt)
    for cat in categoryDataFrame.columns:
        weekData =  origData[origData.date.isin([dt-timedelta(days=i) for i in range(0,5)]) & origData.category.isin([cat])]
        texts = ' '.join(weekData['headline'].tolist() + weekData['short_description'].tolist()).lower()
            
        # use our original n-grams module to create a new set of n-grams and compare them to the existing list
        twoWord = main.ngram(texts, 2, writeToFile=False, returnFrame=True)
        # make use of all the clean up done above to drop stop words, stem copies etc
        twoWord = twoWord[twoWord.index.isin(twoGrams.index)]
        twoWord = twoWord.assign(Frequency = twoGrams['helper_col'].loc[twoWord.index]).sort_values('Frequency', ascending=False)
        categoryDataFrame[cat].loc['2', dt] = twoWord.T.to_dict(orient='list')
        
        oneWord = main.ngram(texts, 1, writeToFile=False, returnFrame=True)
        oneWord = oneWord[oneWord.index.isin(oneGram.index)]
        oneWord = oneWord.assign(Frequency = oneGram['helper_col'].loc[oneWord.index]).sort_values('Frequency', ascending=False)
        categoryDataFrame[cat].loc['1', dt] = oneWord.T.to_dict(orient='list')

2018-05-25 00:00:00
2018-05-18 00:00:00
2018-05-11 00:00:00
2018-05-04 00:00:00
2018-04-27 00:00:00
2018-04-20 00:00:00
2018-04-13 00:00:00
2018-04-06 00:00:00
2018-03-30 00:00:00
2018-03-23 00:00:00
2018-03-16 00:00:00
2018-03-09 00:00:00
2018-03-02 00:00:00
2018-02-23 00:00:00
2018-02-16 00:00:00
2018-02-09 00:00:00
2018-02-02 00:00:00
2018-01-26 00:00:00
2018-01-19 00:00:00
2018-01-12 00:00:00
2018-01-05 00:00:00
2017-12-29 00:00:00
2017-12-22 00:00:00
2017-12-15 00:00:00
2017-12-08 00:00:00
2017-12-01 00:00:00
2017-11-24 00:00:00
2017-11-17 00:00:00
2017-11-10 00:00:00
2017-11-03 00:00:00
2017-10-27 00:00:00
2017-10-20 00:00:00
2017-10-13 00:00:00
2017-10-06 00:00:00
2017-09-29 00:00:00
2017-09-22 00:00:00
2017-09-15 00:00:00
2017-09-08 00:00:00
2017-09-01 00:00:00
2017-08-25 00:00:00
2017-08-18 00:00:00
2017-08-11 00:00:00
2017-08-04 00:00:00
2017-07-28 00:00:00
2017-07-21 00:00:00
2017-07-14 00:00:00
2017-07-07 00:00:00
2017-06-30 00:00:00
2017-06-23 00:00:00
2017-06-16 00:00:00


In [205]:
oneWord.T.to_dict(orient='list')

SyntaxError: invalid syntax (<ipython-input-205-1ef48760cafa>, line 1)

In [194]:
origData.date.min()

Timestamp('2012-01-28 00:00:00')

In [160]:
oneGram[oneGram.Frequency==oneWord.Frequency.loc['students']]

Unnamed: 0,Frequency,helper_col


In [193]:
whos

Variable            Type                    Data/Info
-----------------------------------------------------
SnowballStemmer     ABCMeta                 <class 'nltk.stem.snowball.SnowballStemmer'>
cat                 str                     WELLNESS
categoryDataFrame   DataFrame                            CRIME ENTERT<...>\n[660 rows x 41 columns]
datetime            module                  <module 'datetime' from '<...>37-32\\lib\\datetime.py'>
deleter_column      list                    n=1797575
dt                  Timestamp               2012-04-27 00:00:00
firstWordList       list                    n=212390
importlib           module                  <module 'importlib' from <...>\importlib\\__init__.py'>
key                 str                     zzzzzz gah
main                module                  <module 'main' from 'C:\\<...>graham capital\\main.py'>
math                module                  <module 'math' (built-in)>
ngram               function                <function