# **Imports**

In [1]:
import os
os.chdir("../")
os.chdir(r"src")

import warnings
warnings.filterwarnings("ignore")

from Processing.preprocess_parliament import *

os.chdir("../")

# **Original dataframe filtering**

In [3]:
df_total = dd.read_csv('data/FinalDataframes/FinalDataFrame.csv', assume_missing=True, dtype={'url': 'object'})
df_total = df_total.compute()

In [4]:
df_total['text'] = df_total['text'].map(ast.literal_eval)
df_total['agenda'] = df_total['agenda'].map(ast.literal_eval)

In [5]:
df_total.reset_index(inplace=True)

In [6]:
len(df_total)

297924

In [7]:
df_total

Unnamed: 0,level_0,index,year,Speaker,party,text,source,keywords,agenda,url
0,0,4.0,2010.0,Maria Eagle,Lab,"[begin, echo, view, behalf, sorri, place, toda...",par,[],[],
1,1,5.0,2010.0,Philip Davies,Con,"[proceed, certainli, without, miss, side, give...",par,[],[],
2,2,6.0,2010.0,Maria Eagle,Lab,"[visitor, mean, drug, contraband, brought, pri...",par,[],[],
3,3,7.0,2010.0,Shailesh Vara,Con,"[join, express, sentiment, taylora, true, parl...",par,[],[],
4,4,8.0,2010.0,Maria Eagle,Lab,"[perfectli, legitim, whether, balanc, correct,...",par,[],[],
...,...,...,...,...,...,...,...,...,...,...
297919,26499,,2016.0,,,"[microsoft, launch, surfac, book, back, octob,...",TE,"['microsoft', 'surface', 'tech']","[microsoft, surfac, book, review, brilliant, h...",https://www.telegraph.co.uk/technology/2016/02...
297920,26500,,2016.0,,,"[found, three, cambridg, graduat, predict, nex...",TE,"['microsoft', 'tech']","[swiftkey, founder, sold, stake, bicycl, miss,...",https://www.telegraph.co.uk/technology/2016/02...
297921,26501,,2016.0,,,"[appl, ipad, outsold, microsoft, entir, surfac...",TE,"['ipad', 'microsoft', 'surface', 'tech']","[ipad, outsel, entir, microsoft, surfac, rang]",https://www.telegraph.co.uk/technology/2016/02...
297922,26502,,2015.0,,,"[microsoft, unveil, pinnacl, laptop, design, f...",TE,"['microsoft', 'surface', 'tech']","[best, luck, microsoft, surfac, book, isnt, save]",https://www.telegraph.co.uk/technology/2015/12...


## **Vocabulary use**

Here we use the global vocabulary of our dataframes in order to keep only the 20 000 most repeated words in our texts

In [28]:
with open('data/words/words_final.json') as f:
    words = json.load(f)

In [29]:
word_counts = Counter(words)

In [30]:
bow_sorted = sorted(word_counts.items(), key=operator.itemgetter(1), reverse=True)

In [31]:
len(bow_sorted)

626002

In [32]:
filtrage = bow_sorted[:20000]

In [33]:
liste_filtree = [i[0] for i in filtrage]

In [34]:
def filtre(liste) :
    nouvelle_liste = []
    for word in liste :
        if word in liste_filtree :
            nouvelle_liste.append(word)
    return(nouvelle_liste)

In [35]:
df_total['text'] = df_total['text'].apply(filtre)

In [36]:
df_total

Unnamed: 0,level_0,index,year,Speaker,party,text,source,keywords,agenda,url
0,0,4.0,2010.0,Maria Eagle,Lab,"[begin, echo, view, behalf, sorri, place, toda...",par,[],[],
1,1,5.0,2010.0,Philip Davies,Con,"[proceed, certainli, without, miss, side, give...",par,[],[],
2,2,6.0,2010.0,Maria Eagle,Lab,"[visitor, mean, drug, contraband, brought, pri...",par,[],[],
3,3,7.0,2010.0,Shailesh Vara,Con,"[join, express, sentiment, true, parliamentari...",par,[],[],
4,4,8.0,2010.0,Maria Eagle,Lab,"[perfectli, legitim, whether, balanc, correct,...",par,[],[],
...,...,...,...,...,...,...,...,...,...,...
297919,26499,,2016.0,,,"[microsoft, launch, surfac, book, back, octob,...",TE,"['microsoft', 'surface', 'tech']","[microsoft, surfac, book, review, brilliant, h...",https://www.telegraph.co.uk/technology/2016/02...
297920,26500,,2016.0,,,"[found, three, cambridg, graduat, predict, nex...",TE,"['microsoft', 'tech']","[swiftkey, founder, sold, stake, bicycl, miss,...",https://www.telegraph.co.uk/technology/2016/02...
297921,26501,,2016.0,,,"[appl, ipad, outsold, microsoft, entir, surfac...",TE,"['ipad', 'microsoft', 'surface', 'tech']","[ipad, outsel, entir, microsoft, surfac, rang]",https://www.telegraph.co.uk/technology/2016/02...
297922,26502,,2015.0,,,"[microsoft, unveil, pinnacl, laptop, design, f...",TE,"['microsoft', 'surface', 'tech']","[best, luck, microsoft, surfac, book, isnt, save]",https://www.telegraph.co.uk/technology/2015/12...


We save the filtered final dataframe

In [37]:
df_total.to_csv('data/FinalDataframes/FilteredFinalDataFrame.csv', index=False)

# **Saving by year**

We save a dataframe for each year

In [38]:
for i in range(14):
    df_year = df_total.loc[df_total['year'] == (2010+i)]
    df_year.to_csv('data/FinalDataframes/FilteredFinalDataFrame_201'+str(i)+'.csv', index=False)

## Years observation for database description

Here we study the number of articles coming from the different newspapers each year in order to remove newpapers from our study of a year if its number of articles is too low

In [2]:
df_2010 = dd.read_csv('data/FinalDataframes/FilteredFinalDataFrame_2010.csv', assume_missing=True, dtype={'url': 'object'})
df_2010 = df_2010.compute()
df_2010['text'] = df_2010['text'].map(ast.literal_eval)
df_2010['agenda'] = df_2010['agenda'].map(ast.literal_eval)

In [3]:
df_2011 = dd.read_csv('data/FinalDataframes/FilteredFinalDataFrame_2011.csv', assume_missing=True, dtype={'url': 'object'})
df_2011 = df_2011.compute()
df_2011['text'] = df_2011['text'].map(ast.literal_eval)
df_2011['agenda'] = df_2011['agenda'].map(ast.literal_eval)

In [4]:
df_2012 = dd.read_csv('data/FinalDataframes/FilteredFinalDataFrame_2012.csv', assume_missing=True, dtype={'url': 'object'})
df_2012 = df_2012.compute()
df_2012['text'] = df_2012['text'].map(ast.literal_eval)
df_2012['agenda'] = df_2012['agenda'].map(ast.literal_eval)

In [5]:
df_2013 = dd.read_csv('data/FinalDataframes/FilteredFinalDataFrame_2013.csv', assume_missing=True, dtype={'url': 'object'})
df_2013 = df_2013.compute()
df_2013['text'] = df_2013['text'].map(ast.literal_eval)
df_2013['agenda'] = df_2013['agenda'].map(ast.literal_eval)

In [6]:
df_2014 = dd.read_csv('data/FinalDataframes/FilteredFinalDataFrame_2014.csv', assume_missing=True, dtype={'url': 'object'})
df_2014 = df_2014.compute()
df_2014['text'] = df_2014['text'].map(ast.literal_eval)
df_2014['agenda'] = df_2014['agenda'].map(ast.literal_eval)

In [7]:
df_2015 = dd.read_csv('data/FinalDataframes/FilteredFinalDataFrame_2015.csv', assume_missing=True, dtype={'url': 'object'})
df_2015 = df_2015.compute()
df_2015['text'] = df_2015['text'].map(ast.literal_eval)
df_2015['agenda'] = df_2015['agenda'].map(ast.literal_eval)

In [8]:
df_2016 = dd.read_csv('data/FinalDataframes/FilteredFinalDataFrame_2016.csv', assume_missing=True, dtype={'url': 'object'})
df_2016 = df_2016.compute()
df_2016['text'] = df_2016['text'].map(ast.literal_eval)
df_2016['agenda'] = df_2016['agenda'].map(ast.literal_eval)

In [9]:
df_2017 = dd.read_csv('data/FinalDataframes/FilteredFinalDataFrame_2017.csv', assume_missing=True, dtype={'url': 'object'})
df_2017 = df_2017.compute()
df_2017['text'] = df_2017['text'].map(ast.literal_eval)
df_2017['agenda'] = df_2017['agenda'].map(ast.literal_eval)

In [10]:
df_2018 = dd.read_csv('data/FinalDataframes/FilteredFinalDataFrame_2018.csv', assume_missing=True, dtype={'url': 'object'})
df_2018 = df_2018.compute()
df_2018['text'] = df_2018['text'].map(ast.literal_eval)
df_2018['agenda'] = df_2018['agenda'].map(ast.literal_eval)

In [11]:
df_2019 = dd.read_csv('data/FinalDataframes/FilteredFinalDataFrame_2019.csv', assume_missing=True, dtype={'url': 'object'})
df_2019 = df_2019.compute()
df_2019['text'] = df_2019['text'].map(ast.literal_eval)
df_2019['agenda'] = df_2019['agenda'].map(ast.literal_eval)

In [12]:
df_2020 = dd.read_csv('data/FinalDataframes/FilteredFinalDataFrame_20110.csv', assume_missing=True, dtype={'url': 'object'})
df_2020 = df_2020.compute()
df_2020['text'] = df_2020['text'].map(ast.literal_eval)
df_2020['agenda'] = df_2020['agenda'].map(ast.literal_eval)

In [13]:
df_2021 = dd.read_csv('data/FinalDataframes/FilteredFinalDataFrame_20111.csv', assume_missing=True, dtype={'url': 'object'})
df_2021 = df_2021.compute()
df_2021['text'] = df_2021['text'].map(ast.literal_eval)
df_2021['agenda'] = df_2021['agenda'].map(ast.literal_eval)

In [14]:
df_2022 = dd.read_csv('data/FinalDataframes/FilteredFinalDataFrame_20112.csv', assume_missing=True, dtype={'url': 'object'})
df_2022 = df_2022.compute()
df_2022['text'] = df_2022['text'].map(ast.literal_eval)
df_2022['agenda'] = df_2022['agenda'].map(ast.literal_eval)

In [15]:
df_2023 = dd.read_csv('data/FinalDataframes/FilteredFinalDataFrame_20113.csv', assume_missing=True, dtype={'url': 'object'})
df_2023 = df_2023.compute()
df_2023['text'] = df_2023['text'].map(ast.literal_eval)
df_2023['agenda'] = df_2023['agenda'].map(ast.literal_eval)

In [16]:
for i in range(10):
    print(len(eval('df_201'+str(i))))

23839
27658
26912
29328
26950
25983
32787
29640
34378
29050


In [17]:
for i in range(4):
    print(len(eval('df_202'+str(i))))

3743
3517
2315
1385


In [18]:
for i in range(4):
    print(len(eval('df_202'+str(i)).loc[eval('df_202'+str(i))['source'] == 'DM']))

797
738
688
530


In [19]:
for i in range(4):
    print(len(eval('df_202'+str(i)).loc[eval('df_202'+str(i))['source'] == 'DE']))

966
778
214
93


In [20]:
for i in range(4):
    print(len(eval('df_202'+str(i)).loc[eval('df_202'+str(i))['source'] == 'GUA']))

832
1125
827
491


In [21]:
for i in range(4):
    print(len(eval('df_202'+str(i)).loc[eval('df_202'+str(i))['source'] == 'TE']))

833
532
178
150


Telegraph c'est bon que pour 2016-2019

In [22]:
for i in range(4):
    print(len(eval('df_202'+str(i)).loc[eval('df_202'+str(i))['source'] == 'MET']))

315
344
408
121


In [26]:
data_trace = {'The Guardian': [1374 , 1474, 1631, 1923, 1908, 1573, 1496, 1133, 987, 986, 832, 1125, 827, 491],
        'Metro' : [445,301,224,306,301,244,318,324,547,486,315,344,408,121],
        'The Telegraph' : [0,0,1,0,1,27,655,703,901,787,833,532,178,150],
        'The Daily Mail' : [5, 741, 2138, 2000, 1885, 1708, 2223, 2035, 1465, 678, 797, 738, 688, 530],
        'The Daily Express' : [183, 180, 226, 371, 403, 1579, 2493, 3196, 2363, 1605, 3743, 3517, 2315, 1385],
        }
  
df_trace = pd.DataFrame(data_trace, index=[2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023])
df_trace.transpose()
import plotly.express as px
fig = px.line(df_trace)
fig.update_layout(
    autosize=False,
    width=500,
    height=500)
fig.show()

## By political party

In [27]:
dfp = dd.read_csv('data/FinalDataframes/FilteredFinalDataFrame.csv', assume_missing=True, dtype={'url': 'object'})
dfp = dfp.compute()
dfp['text'] = dfp['text'].map(ast.literal_eval)
dfp['agenda'] = dfp['agenda'].map(ast.literal_eval)

In [28]:
dfp = dfp[['year', 'party']]
dfp = dfp.groupby(by=['year', 'party'])['party'].count()
dfp = pd.DataFrame(dfp)
dfp['count'] = dfp['party']
dfp = dfp[['count']]
dfp.reset_index(inplace=True)
dfp = dfp.loc[dfp['party'].str.contains('Con|Lab|SNP|LibDem|DUP')]

In [30]:
import plotly.express as px
fig = px.line(dfp, x='year', y='count', color='party')
fig.update_layout(
    autosize=False,
    width=500,
    height=500)
fig.show()

## By GAFAM

In [None]:
for i in range(4):
    print(len(eval('df_202'+str(i)).loc[eval('df_202'+str(i))['keywords'].str.contains('meta|facebook|messenger|instagram|oculus|whatsApp|zuckerberg|olivan|clegg')]))

708
964
639
305


In [None]:
for i in range(4):
    print(len(eval('df_202'+str(i)).loc[eval('df_202'+str(i))['keywords'].str.contains('amazon|prime|whole-foods|zappos|pillpack|twitch|audible|goodreads|imdb|bezos|jassy|olsavsky|alexander')]))

567
545
404
267


In [None]:
for i in range(4):
    print(len(eval('df_202'+str(i)).loc[eval('df_202'+str(i))['keywords'].str.contains('google|android|chrome|gmail|maps|playstore|pixel|waze|youTube|alphabet|mandiant|fitbit|looker|nest|doubleclick|page|brin|pichai|kurian')]))

845
616
395
248


In [None]:
for i in range(4):
    print(len(eval('df_202'+str(i)).loc[eval('df_202'+str(i))['keywords'].str.contains('apple|iphone|iPad|mac|watch|macbook|ios|airpods|app-store|itunes|icloud|beats|siri|shazam|cook|jobs')]))

584
469
350
165


In [None]:
for i in range(4):
    print(len(eval('df_202'+str(i)).loc[eval('df_202'+str(i))['keywords'].str.contains('microsoft|windows|cortana|excel|explorer|office|edge|teams|outlook|powerpoint|skype|surface|word|xbox|linkedIn|github|mojang|gates|nadella')]))

1063
887
365
263


In [32]:
data_trace = {'Facebook': [382 , 464, 1213, 1142, 1277, 1570, 2197, 1442, 1535, 991, 708 , 964, 639, 305],
        'Amazon' : [72, 147, 214, 334, 430, 481, 677, 1169, 1215, 1042, 567, 545, 404, 267],
        'Google' : [610, 655, 1010, 1388, 1144, 1292, 1450, 1548, 1361, 972, 845, 616, 395, 248],
        'Apple' : [274, 680, 1120, 1056, 1007, 975, 1567, 1746, 1254, 636, 584, 469, 350, 165],
        'Microsoft' : [220, 269, 318, 330, 334, 664, 1119, 1228, 785, 927, 1063, 887, 365, 263],
        }
  
df_trace = pd.DataFrame(data_trace, index=[2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023])
df_trace.transpose()
import plotly.express as px
fig = px.line(df_trace)
fig.update_layout(
    autosize=False,
    width=500,
    height=500)
fig.show()

## Number of words from parliament / newpapers

In [None]:
df_p = df_total.loc[df_total['source'] == 'par']
words_par = []
for i in df_p['text']:
    words_par += i

In [None]:
df_j = df_total.loc[df_total['source'] != 'par']
words_j = []
for i in df_j['text']:
    words_j += i

In [None]:
len(words_par)/len(words_j)

2.124843863274066