In [1]:
import nltk
import pandas as pd
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\arjun\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
df = pd.read_csv("train.csv")

In [4]:
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
df["tokenized"] = df["text"]
df = df.apply(lambda x: x.str.lower() if x.dtype == "object" else x) 
df["tokenized"] = df['tokenized'].str.replace('[^\w\s]','')

In [6]:
df["tokenized"] = df["tokenized"].apply(word_tokenize)
df["tokenized"].head()

0    [our, deeds, are, the, reason, of, this, earth...
1        [forest, fire, near, la, ronge, sask, canada]
2    [all, residents, asked, to, shelter, in, place...
3    [13000, people, receive, wildfires, evacuation...
4    [just, got, sent, this, photo, from, ruby, ala...
Name: tokenized, dtype: object

In [7]:
df.head()

Unnamed: 0,id,keyword,location,text,target,tokenized
0,1,,,our deeds are the reason of this #earthquake m...,1,"[our, deeds, are, the, reason, of, this, earth..."
1,4,,,forest fire near la ronge sask. canada,1,"[forest, fire, near, la, ronge, sask, canada]"
2,5,,,all residents asked to 'shelter in place' are ...,1,"[all, residents, asked, to, shelter, in, place..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"[13000, people, receive, wildfires, evacuation..."
4,7,,,just got sent this photo from ruby #alaska as ...,1,"[just, got, sent, this, photo, from, ruby, ala..."


In [8]:
df

Unnamed: 0,id,keyword,location,text,target,tokenized
0,1,,,our deeds are the reason of this #earthquake m...,1,"[our, deeds, are, the, reason, of, this, earth..."
1,4,,,forest fire near la ronge sask. canada,1,"[forest, fire, near, la, ronge, sask, canada]"
2,5,,,all residents asked to 'shelter in place' are ...,1,"[all, residents, asked, to, shelter, in, place..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"[13000, people, receive, wildfires, evacuation..."
4,7,,,just got sent this photo from ruby #alaska as ...,1,"[just, got, sent, this, photo, from, ruby, ala..."
...,...,...,...,...,...,...
7608,10869,,,two giant cranes holding a bridge collapse int...,1,"[two, giant, cranes, holding, a, bridge, colla..."
7609,10870,,,@aria_ahrary @thetawniest the out of control w...,1,"[aria_ahrary, thetawniest, the, out, of, contr..."
7610,10871,,,m1.94 [01:04 utc]?5km s of volcano hawaii. htt...,1,"[m194, 0104, utc5km, s, of, volcano, hawaii, h..."
7611,10872,,,police investigating after an e-bike collided ...,1,"[police, investigating, after, an, ebike, coll..."


In [9]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
df["tokenized"] = df["tokenized"].apply(lambda words: [word for word in words if word not in stop_words])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\arjun\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
df.head()

Unnamed: 0,id,keyword,location,text,target,tokenized
0,1,,,our deeds are the reason of this #earthquake m...,1,"[deeds, reason, earthquake, may, allah, forgiv..."
1,4,,,forest fire near la ronge sask. canada,1,"[forest, fire, near, la, ronge, sask, canada]"
2,5,,,all residents asked to 'shelter in place' are ...,1,"[residents, asked, shelter, place, notified, o..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"[13000, people, receive, wildfires, evacuation..."
4,7,,,just got sent this photo from ruby #alaska as ...,1,"[got, sent, photo, ruby, alaska, smoke, wildfi..."


In [11]:
stemmer = PorterStemmer()
df['stemmed'] = df['tokenized'].apply(lambda x: [stemmer.stem(y) for y in x])

In [12]:
df.head()

Unnamed: 0,id,keyword,location,text,target,tokenized,stemmed
0,1,,,our deeds are the reason of this #earthquake m...,1,"[deeds, reason, earthquake, may, allah, forgiv...","[deed, reason, earthquak, may, allah, forgiv, us]"
1,4,,,forest fire near la ronge sask. canada,1,"[forest, fire, near, la, ronge, sask, canada]","[forest, fire, near, la, rong, sask, canada]"
2,5,,,all residents asked to 'shelter in place' are ...,1,"[residents, asked, shelter, place, notified, o...","[resid, ask, shelter, place, notifi, offic, ev..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"[13000, people, receive, wildfires, evacuation...","[13000, peopl, receiv, wildfir, evacu, order, ..."
4,7,,,just got sent this photo from ruby #alaska as ...,1,"[got, sent, photo, ruby, alaska, smoke, wildfi...","[got, sent, photo, rubi, alaska, smoke, wildfi..."


In [13]:
pos_index = {}
document = df["stemmed"]
print(type(document))

<class 'pandas.core.series.Series'>


In [14]:
my_dict = pd.Series(df.stemmed.values,index=df.id).to_dict()

In [15]:
my_dict[1]

['deed', 'reason', 'earthquak', 'may', 'allah', 'forgiv', 'us']

In [16]:
pos_ind = {}
fileno = 0
for key in my_dict:
    for i in range(len(my_dict[key])):
        term = my_dict[key][i]
        if my_dict[key][i] in pos_ind:
            pos_ind[term][0] = pos_ind[term][0] + 1
            if fileno in pos_index[term][1]:
                pos_ind[term][1][fileno].append(key)
            else:
                pos_ind[term][1][fileno] = [key]
        else:
            pos_ind[term] = []
            pos_ind[term].append(1)
            pos_ind[term].append({}) 
            pos_ind[term][1][fileno] = [key]
        fileno += 1

KeyError: 'shelter'

In [None]:
print(pos_ind)