In [1]:
import gensim
from glob import glob
import pandas as pd
from tqdm import tqdm, tqdm_notebook
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import pickle

In [2]:
#For removing punctuation
table = str.maketrans('', '', string.punctuation)
np = pd.np
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
tqdm_notebook(disable = True).pandas()

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/aashish_jain/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
def read_articles(path, show_progress = True):
    df_list = []
    for file_name in tqdm_notebook(glob(path), disable = not show_progress):
            temp_df = pd.read_csv(file_name, index_col=0)
            temp_df["date"] = file_name.split("/")[-1].split('.')[0]
            df_list.append(temp_df)
    df = pd.concat(df_list, ignore_index=True)
    try:
        df["date"] = pd.to_datetime(df["date"])
    except:
        print("Unable to convert the date string to date")
        pass
    return df

In [7]:
toi_df = read_articles("../../TOI/*.csv")
train_data = toi_df[toi_df["date"] < pd.to_datetime("1-Jan-2019")]
test_data = toi_df[toi_df["date"] >= pd.to_datetime("1-Jan-2019")]

HBox(children=(IntProgress(value=0, max=1197), HTML(value='')))




In [5]:
# augmented_data = read_articles("/data/ateendra/all-the-news/*.csv")
# augmented_data = augmented_data[['title','content']]
# # augmented_data.columns = ['title','text']
# train_data = pd.concat([train_data, augmented_data], sort=False)
# train_data.index = np.arange(len(train_data))

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))


Unable to the date string to date


In [11]:
train_data.index = np.arange(len(train_data))
test_data.index = np.arange(len(test_data))

In [13]:
def generate_document_vocabulary(text):
    vocabulary = []
    for word in word_tokenize(text):
        w = word.translate(table).lower()
        if w.isalpha() and w not in stop_words:
            vocabulary.append(w)
    return vocabulary

In [14]:
train_data['vocabulary'] = train_data['text'].progress_apply(generate_document_vocabulary)

HBox(children=(IntProgress(value=0, max=45406), HTML(value='')))




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [15]:
documents = []

for i, row in train_data.iterrows():
    document = TaggedDocument(row['vocabulary'], [i])
    documents.append(document)

print("Documents to train on is", len(documents))

Documents to train on is 45406


In [16]:
pickle.dump(documents, open("tagged_doc_dump.pkl","wb"))

In [18]:
max_epochs = 30
vec_size = 50
alpha = 0.025

# Distributed memory and not distributed bag of words
model = Doc2Vec(vector_size=vec_size,
                alpha=alpha,
                min_alpha=0.00025,
                min_count=1,
                dm=1,
                workers=8)

In [19]:
model.build_vocab(documents)

In [20]:
for _ in tqdm_notebook(range(max_epochs)):
    model.train(documents,
                total_examples=model.corpus_count,
                epochs=model.epochs,)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

HBox(children=(IntProgress(value=0, max=30), HTML(value='')))




In [21]:
model.save("article.d2v")

In [None]:
model.docvecs.similarity(test_data['voac'], test_documents[1])

In [26]:
acled_data = pd.read_csv("../data/ACLED-India-27-Apr-18.csv")

In [27]:
acled_data

Unnamed: 0,data_id,iso,event_id_cnty,event_id_no_cnty,event_date,year,time_precision,event_type,sub_event_type,actor1,...,location,latitude,longitude,geo_precision,source,source_scale,notes,fatalities,timestamp,iso3
0,5152749,356,IND51288,51288,27 April 2019,2019,1,Battles,Armed clash,CPI (Maoist): Communist Party of India (Maoist),...,Timmapur,18.5352,80.8859,2,Times of India,National,"On 27 April, two police officers were killed a...",2,1556628778.0,IND
1,5153022,356,IND51297,51297,27 April 2019,2019,1,Riots,Violent demonstration,Rioters (India),...,Kishtwar,33.3135,75.7673,1,Kashmir Times,Subnational,"On 27 Apr, rioters, led by the local religious...",0,1556628778.0,IND
2,5153033,356,IND51299,51299,27 April 2019,2019,1,Protests,Peaceful protest,Protesters (India),...,Jammu,32.7357,74.8691,1,Early Times (India),Subnational,"On 27 Apr, family members of a deceased woman ...",0,1556628778.0,IND
3,5153038,356,IND51300,51300,27 April 2019,2019,1,Protests,Peaceful protest,Protesters (India),...,Jammu,32.7357,74.8691,1,Early Times (India),Subnational,"On 27 Apr, residents of Trikuta Nagar in Jammu...",0,1556628778.0,IND
4,5153039,356,IND51294,51294,27 April 2019,2019,1,Protests,Peaceful protest,Protesters (India),...,Jammu,32.7357,74.8691,1,Early Times (India),Subnational,"On 27 Apr, the VRS Union staged a protest at t...",0,1556628778.0,IND
5,5153045,356,IND51298,51298,27 April 2019,2019,1,Riots,Mob violence,Rioters (India),...,Salwah,33.6017,74.2113,1,Kashmir Times,Subnational,"On 27 Apr, rioters clashed with police in Salw...",0,1556628778.0,IND
6,5153052,356,IND51295,51295,27 April 2019,2019,1,Protests,Peaceful protest,Protesters (India),...,Tangmarg,34.0611,74.4257,1,Kashmir News Service,Subnational,"On 27 Apr, trader associations from the Tangma...",0,1556628778.0,IND
7,5153066,356,IND51296,51296,27 April 2019,2019,1,Protests,Peaceful protest,Protesters (India),...,Srinagar,34.0857,74.8056,1,Rising Kashmir,National,"On 27 Apr, junior assistant and stenotypes asp...",0,1556628778.0,IND
8,5153071,356,IND51301,51301,27 April 2019,2019,1,Protests,Peaceful protest,Protesters (India),...,Kathua,32.3867,75.5174,1,Daily Excelsior,National,"On 27 Apr, local residents staged a protest at...",0,1556628778.0,IND
9,5153072,356,IND51302,51302,27 April 2019,2019,1,Battles,Armed clash,Bishnah Communal Militia (India),...,Bishnah,32.6106,74.8556,2,Daily Excelsior,Subnational,"On 27 Apr, nine people were injured during an ...",0,1556628778.0,IND


In [67]:
article = "On July 15, a long protest march by farmers, from Mandsaur in Madhya Pradesh to New Delhi, demanding loan waiver and fair price for their produce, reached Jaipur."

In [76]:
# article = ' '.join(generate_document_vocabulary(article))
article = ' '.join(train_data['vocabulary'].loc[0])

In [77]:
inferred_vector = model.infer_vector([article])
sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

In [80]:
train_data.vocabulary.loc[0]

['srinagar',
 'peoples',
 'democratic',
 'party',
 'pdp',
 'president',
 'mehbooba',
 'mufti',
 'sunday',
 'said',
 'choking',
 'democratic',
 'space',
 'punishing',
 'people',
 'collectively',
 'new',
 'policy',
 'adopted',
 'centre',
 'deal',
 'kashmirthe',
 'purpose',
 'behind',
 'repressive',
 'measures',
 'taken',
 'jammu',
 'kashmir',
 'bjpled',
 'government',
 'show',
 'electorate',
 'tough',
 'saffron',
 'party',
 'kashmiris',
 'mufti',
 'claimed',
 'choking',
 'democratic',
 'space',
 'punishing',
 'people',
 'collectively',
 'unfortunately',
 'new',
 'policy',
 'adopted',
 'government',
 'india',
 'deal',
 'kashmir',
 'former',
 'chief',
 'minister',
 'said',
 'pdp',
 'youth',
 'convention',
 'herereferring',
 'ban',
 'imposed',
 'jklf',
 'jamaateislami',
 'jammu',
 'kashmir',
 'said',
 'ironic',
 'one',
 'hand',
 'central',
 'government',
 'claims',
 'launched',
 'rehabilitation',
 'scheme',
 'terrorists',
 'lay',
 'arms',
 'organisations',
 'like',
 'jammat',
 'jklf',
 'rej

In [78]:
sims

[(17427, 0.5008977651596069),
 (5395, 0.47333860397338867),
 (35413, 0.4725831151008606),
 (22930, 0.466962993144989),
 (40266, 0.448144793510437),
 (2112, 0.4463515281677246),
 (40747, 0.44367122650146484),
 (35499, 0.4425381124019623),
 (14411, 0.4419245421886444),
 (7118, 0.44013839960098267),
 (27979, 0.4398616552352905),
 (34085, 0.43917006254196167),
 (28969, 0.43916821479797363),
 (17293, 0.4362945854663849),
 (12326, 0.436016708612442),
 (45232, 0.4349808692932129),
 (27864, 0.43325191736221313),
 (28245, 0.4316686987876892),
 (42142, 0.430948942899704),
 (29173, 0.42854657769203186),
 (42583, 0.42552709579467773),
 (38734, 0.42516520619392395),
 (39534, 0.4250064492225647),
 (6208, 0.4234922528266907),
 (28244, 0.4225272536277771),
 (42466, 0.42221054434776306),
 (34910, 0.4198230504989624),
 (1650, 0.4195080101490021),
 (26327, 0.4178103804588318),
 (2197, 0.41751593351364136),
 (41750, 0.4167533814907074),
 (41713, 0.41581499576568604),
 (5296, 0.41516363620758057),
 (5182, 

In [79]:
' '.join(documents[17427].words)

'new delhi state thursday sought death penalty two men convicted murder heinous charges antisikh riots case prosecution submitted court killings premeditated part genocide sikhsasserting rarest rare case victims deprived justice years public prosecutor k kain sit lawyer surinder mohit singh urged additional sessions judge ajay pandey award capital punishment defence hand asked court leniency fate convicts hangs sentencing november two men killed communal frenzy november hardev singh avtar singh court convicted two men killings making first conviction secured sit constituted centre probe riot caseskain submitted convicted sehrawat yashpal caused brutal murders wounded three others brought kerosene sticks stones etc case falls within aggravating circumstances committed planning involved extreme cruelty prosecution arguedthe prosecution also submitted crime exhibited exceptional depravity thousands innocent people murdered properties looted delhi incidents affected society large sikh comm

In [34]:
document_dic = {}
for doc,tag in documents:
    document_dic[tag[0]] = doc

In [None]:
sims

In [None]:
document_dic[]

In [58]:
words = '\n'.join(list(model.wv.vocab.keys()))

In [47]:


ranks = []
second_ranks = []
for doc_id, row in train_data.iterrows():
    inferred_vector = model.infer_vector(' '.join(train_data['vocabulary']))
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)
    
    second_ranks.append(sims[1])



TypeError: sequence item 0: expected str instance, list found

In [31]:
sabri = [row['notes'] for _,row in acled_data.iterrows() if 'sabri' in row['notes'].lower()]

In [35]:
sabri

['On October 17, near Sabrimala (Pathanamthitta, Kerala), police clashed with over 100 Hindu devotees who were prevented from gaining entry into the Lord Ayyappa temple. The number of injuries was not reported.']

In [66]:
sabri = generate_document_vocabulary(sabri[0])

In [67]:
vec = model.infer_vector(sabri)

In [68]:
sims = model.docvecs.most_similar([vec])

In [40]:
document_dic[52427]

KeyError: 52427

In [36]:
document_dic[128864]

KeyError: 128864

In [69]:
sims

[(47841, 0.8112000226974487),
 (69038, 0.8092457056045532),
 (52428, 0.8083778619766235),
 (128864, 0.8016010522842407),
 (52590, 0.800187885761261),
 (53313, 0.7976787090301514),
 (52427, 0.7958401441574097),
 (110452, 0.79188472032547),
 (45328, 0.7900034189224243),
 (54048, 0.7822529077529907)]

In [77]:
documents[0].tags

[0]

In [79]:
document_dic = {}
for doc, tags in documents:
    document_dic[tags[0]] = doc

In [83]:
document_dic[53313]

['see',
 'fox',
 'news',
 'battleground',
 'prediction',
 'map',
 'make',
 'election',
 'projections',
 'see',
 'predictions',
 'map']