# Data Analysis

In [None]:
# Python library imports
# From ADA tutorial on NLP
%load_ext autoreload
%autoreload 2

import warnings; warnings.simplefilter('ignore')
import os, codecs, string, random
import numpy as np
from numpy.random import seed as random_seed
from numpy.random import shuffle as random_shuffle
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline  

# Seed both global random number generators (NumPy's and the stdlib's)
# with the same value so that repeated runs of the notebook are reproducible.
seed = 42
np.random.seed(seed)
random.seed(seed)

#NLP libraries
import spacy, nltk, gensim, sklearn
import pyLDAvis.gensim

#Vader
import vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

#Scikit imports
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

# print redirect
from contextlib import redirect_stdout

## Import

In [None]:
# Load the article corpus and keep every journal except the NZZ.
df = pd.read_csv('../data/data.csv')
df = df[df.journal != 'NZZ']

# Inclusive bounds, compared against df.date as-is.
# NOTE(review): presumably df.date holds ISO 'YYYY-MM-DD' strings, in which
# case the comparison is lexicographic and '-00' sorts before any real day
# of the month -- confirm against the CSV.
start_date = '1914-08-00'
end_date = '1914-10-00'
# An article must contain every keyword to be kept.
keywords = [
    'Belgique',
]

# Build the mask on df's own index. After the journal filter df keeps its
# original row labels, so a mask indexed 0..len(df)-1 would be misaligned
# and would silently drop rows whose label is >= len(df).
filters = pd.Series(True, index=df.index, dtype=bool)
for keyword in keywords:
    # na=False: an article without fulltext cannot match and must not
    # propagate NaN into the boolean mask.
    filters &= df.fulltext.str.contains(keyword, na=False)
df_filtered = df[filters
        & (df.date >= start_date)
        & (df.date <= end_date)]

# Number of selected articles vs. size of the (non-NZZ) corpus.
print(len(df_filtered), len(df))

## Français

In [None]:
# Load the article corpus and keep the French-language journals
# (everything except the NZZ).
df = pd.read_csv('../data/data.csv')
df = df[df.journal != 'NZZ']

# Inclusive bounds, compared against df.date as-is.
# NOTE(review): presumably df.date holds ISO 'YYYY-MM-DD' strings, in which
# case the comparison is lexicographic and '-00' sorts before any real day
# of the month -- confirm against the CSV.
start_date = '1914-08-00'
end_date = '1914-10-00'
# An article must contain every keyword to be kept.
keywords = [
    'Belgique',
]

# Build the mask on df's own index. After the journal filter df keeps its
# original row labels, so a mask indexed 0..len(df)-1 would be misaligned
# and would silently drop rows whose label is >= len(df).
filters = pd.Series(True, index=df.index, dtype=bool)
for keyword in keywords:
    # na=False: an article without fulltext cannot match and must not
    # propagate NaN into the boolean mask.
    filters &= df.fulltext.str.contains(keyword, na=False)
df_filtered = df[filters
        & (df.date >= start_date)
        & (df.date <= end_date)]

# Number of selected articles vs. size of the (non-NZZ) corpus.
print(len(df_filtered), len(df))

In [None]:
# Flatten the selected French articles into a single string for spaCy.
# Missing titles/bodies become empty strings instead of the literal 'nan',
# which would otherwise show up as a word in the frequency list.
columns = df_filtered[['title', 'fulltext']].fillna('').astype(str)
# One "<title> <fulltext>" string per article, without stray outer spaces.
articles = (columns['title'] + ' ' + columns['fulltext']).str.strip()
# Join with a space so the last word of one article is not fused with the
# first word of the next; completely empty articles are skipped.
text = " ".join(article for article in articles if article)
len(text)

In [None]:
# Load the French spaCy pipeline.
# NOTE(review): 'fr' is a shortcut name; spaCy v3 dropped shortcut links in
# favour of full package names (e.g. 'fr_core_news_sm') -- confirm against
# the installed spaCy version.
nlp = spacy.load('fr')

In [None]:
# Raise the pipeline's max_length limit so the whole concatenated French
# corpus can be processed in a single call.
nlp.max_length = 3_000_000
# Run the French pipeline over the corpus.
doc = nlp(text)

In [None]:
# Rank the lemmas of the French corpus by frequency and write them to disk.
from collections import Counter

# Part-of-speech tags that are left out of the ranking.
excluded_pos = {'PUNCT', 'DET', 'ADP', 'PRON', 'SCONJ'}

# Counter tallies the lemmas in a single pass; calling list.count() once per
# unique lemma was quadratic in the corpus size.
lemma_counts = Counter(
    token.lemma_
    for token in doc
    if not token.is_stop and token.pos_ not in excluded_pos
)
# List of (lemma, count) pairs, most frequent first.
pipeline = lemma_counts.most_common()

# Explicit UTF-8 so accented lemmas are written correctly on every platform.
# NOTE(review): the German cell writes to the same 'results.txt', so running
# it afterwards overwrites this output.
with open('results.txt', 'w', encoding='utf-8') as f:
    for n, c in pipeline:
        print(f"{c:4d}\t\t{n}", file=f)

## Deutsch

In [None]:
# Load the article corpus and keep only the German-language NZZ.
dfde = pd.read_csv('../data/data.csv')
dfde = dfde[dfde.journal == 'NZZ']

# Inclusive bounds, compared against dfde.date as-is.
# NOTE(review): presumably dfde.date holds ISO 'YYYY-MM-DD' strings, in which
# case the comparison is lexicographic and '-00' sorts before any real day
# of the month -- confirm against the CSV.
start_date = '1914-08-00'
end_date = '1914-10-00'

# An article must contain every keyword to be kept.
keywords = [
    'Belgien',
]

# Build the mask on dfde's own index. The previous mask was sized from the
# French frame (len(df)) and indexed 0..n-1, which matches neither the
# length nor the row labels of dfde.
filters = pd.Series(True, index=dfde.index, dtype=bool)
for keyword in keywords:
    # na=False: an article without fulltext cannot match and must not
    # propagate NaN into the boolean mask.
    filters &= dfde.fulltext.str.contains(keyword, na=False)

dfde_filtered = dfde[filters
        & (dfde.date >= start_date)
        & (dfde.date <= end_date)]
# Number of selected articles vs. size of the NZZ corpus.
print(len(dfde_filtered), len(dfde))

In [None]:
# Flatten the selected German articles into a single string for spaCy.
# Missing titles/bodies become empty strings instead of the literal 'nan',
# which would otherwise show up as a word in the frequency list.
columns_de = dfde_filtered[['title', 'fulltext']].fillna('').astype(str)
# One "<title> <fulltext>" string per article, without stray outer spaces.
articles_de = (columns_de['title'] + ' ' + columns_de['fulltext']).str.strip()
# Join with a space so the last word of one article is not fused with the
# first word of the next; completely empty articles are skipped.
textde = " ".join(article for article in articles_de if article)
len(textde)

In [None]:
# Load the German spaCy pipeline.
# NOTE(review): 'de' is a shortcut name; spaCy v3 dropped shortcut links in
# favour of full package names (e.g. 'de_core_news_sm') -- confirm against
# the installed spaCy version.
nlpde = spacy.load('de')

In [None]:
# Raise the pipeline's max_length limit so the whole concatenated German
# corpus can be processed in a single call.
nlpde.max_length = 8_000_000
# Run the German pipeline over the corpus.
docde = nlpde(textde)

In [None]:
# Rank the lemmas of the German corpus by frequency and write them to disk.
from collections import Counter

# Part-of-speech tags that are left out of the ranking.
excluded_pos = {'PUNCT', 'DET', 'ADP', 'PRON', 'SCONJ'}

# The previous version filtered an undefined name ('pos_tagged') and raised
# NameError. Counter also tallies the lemmas in a single pass instead of
# calling list.count() once per unique lemma.
lemma_counts = Counter(
    token.lemma_
    for token in docde
    if not token.is_stop and token.pos_ not in excluded_pos
)
# List of (lemma, count) pairs, most frequent first.
pipeline = lemma_counts.most_common()

# Explicit UTF-8 so umlauts and 'ß' are written correctly on every platform.
# NOTE(review): the French cell writes to the same 'results.txt', so this
# output replaces it.
with open('results.txt', 'w', encoding='utf-8') as f:
    for n, c in pipeline:
        print(f"{c}\t\t{n}", file=f)