In [49]:
import os
import numpy as np
import pandas as pd
import re
import math
import spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.ru.stop_words import STOP_WORDS

In [2]:
# import data
# Load the "statements on major issues" corpus. The path is relative to the
# project root (the notebook's working directory), like the other relative
# 'data/...' paths used in this notebook, so the cell no longer depends on one
# user's home directory.
# NOTE: pd.read_pickle can execute arbitrary code while unpickling -- only load
# files that come from a trusted source.
somi = pd.read_pickle('data/corpus_adriana/corpus_president_of_russia/statements_on_major_issues.pkl')
somi.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36658 entries, 0 to 37587
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   date        36658 non-null  object
 1   url         36658 non-null  object
 2   title       36658 non-null  object
 3   speaker     36658 non-null  object
 4   text        36658 non-null  object
 5   text_clean  36658 non-null  object
dtypes: object(6)
memory usage: 2.0+ MB


In [3]:
# Load the "messages to the Federal Assembly" corpus. The path is relative to
# the project root (the notebook's working directory) instead of an absolute
# path inside one user's home directory.
# NOTE: pd.read_pickle can execute arbitrary code while unpickling -- only load
# files that come from a trusted source.
mtfa = pd.read_pickle('data/corpus_adriana/corpus_president_of_russia/messages_to_federal_assembly.pkl')
mtfa.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   date        14 non-null     object
 1   URL         14 non-null     object
 2   title       14 non-null     object
 3   speaker     14 non-null     object
 4   text        14 non-null     object
 5   text_clean  14 non-null     object
dtypes: object(6)
memory usage: 800.0+ bytes


In [4]:
# Load the interviews corpus. The path is relative to the project root (the
# notebook's working directory) instead of an absolute path inside one user's
# home directory.
# NOTE: pd.read_pickle can execute arbitrary code while unpickling -- only load
# files that come from a trusted source.
interviews = pd.read_pickle('data/corpus_adriana/corpus_president_of_russia/interviews.pkl')
interviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9642 entries, 0 to 9641
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   date        9642 non-null   object
 1   url         9642 non-null   object
 2   title       9642 non-null   object
 3   speaker     9642 non-null   object
 4   text        9642 non-null   object
 5   text_clean  9642 non-null   object
dtypes: object(6)
memory usage: 452.1+ KB


In [5]:
# Load the news corpus from an Excel workbook (path relative to the working directory)
news = pd.read_excel('data/putin_corpus.xlsx')
# Print column names, non-null counts and dtypes of the loaded frame
news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33189 entries, 0 to 33188
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   date          33189 non-null  object
 1   URL           33189 non-null  object
 2   description   30316 non-null  object
 3   introduction  30316 non-null  object
 4   id            33189 non-null  int64 
 5   speaker       33189 non-null  object
 6   p             33189 non-null  object
dtypes: int64(1), object(6)
memory usage: 1.8+ MB


In [6]:
# Load the Lavrov corpus. The path is relative to the project root (the
# notebook's working directory) instead of an absolute path inside one user's
# home directory.
# NOTE: pd.read_pickle can execute arbitrary code while unpickling -- only load
# files that come from a trusted source.
lavrov = pd.read_pickle('data/corpus_adriana/corpus_lavrov/lavrov_clean.pkl')
lavrov.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5208 entries, 0 to 5212
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   speaker     5208 non-null   object        
 1   date        5208 non-null   datetime64[ns]
 2   url         5208 non-null   object        
 3   title       5208 non-null   object        
 4   text        5208 non-null   object        
 5   word_count  5208 non-null   int64         
 6   lemma       5208 non-null   object        
dtypes: datetime64[ns](1), int64(1), object(5)
memory usage: 325.5+ KB


In [7]:
# lavrov has no 'text_clean' column; expose its 'text' column under that name
# so the cells below can address every frame through 'text_clean'
lavrov.rename({'text': 'text_clean'}, axis='columns', inplace=True)

In [8]:
# number of distinct URLs in each dataset
# (note: mtfa and news name the column 'URL', the other frames use 'url')
print(f"somi_url : {somi['url'].nunique()}")
print(f"mtfa_url : {mtfa['URL'].nunique()}")
print(f"interviews_url : {interviews['url'].nunique()}")
print(f"news_url : {news['URL'].nunique()}")
print(f"lavrov_url : {lavrov['url'].nunique()}")

somi_url : 533
mtfa_url : 14
interviews_url : 114
news_url : 871
lavrov_url : 5208


In [9]:
def clean(text):
    """Normalise a raw text string into lower-case, space-separated words.

    The substitutions run in the order listed below, then the text is
    lower-cased, non-breaking spaces are replaced, and leading/trailing
    whitespace is stripped.
    """
    substitutions = (
        # newline characters -> space
        (r'\n', ' '),
        # speaker labels "Д.Медведев:" / "В.Путин:" -> space
        (r'(Д\.Медведев:|В\.Путин:)', ' '),
        # every run of non-word characters -> single space
        (r'\W+', ' '),
        # two or more consecutive dots -> space
        (r'\.{2,}', ' '),
        # runs of whitespace -> single space
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    lowered = text.lower()
    # non-breaking spaces (\xa0) are replaced last, after lower-casing
    return re.sub(r'\xa0', ' ', lowered).strip()

In [10]:
# news has no 'text_clean' column yet: build it by running clean() on every 'p' entry
news['text_clean'] = news['p'].apply(clean)

In [11]:
# Re-inspect the frame to confirm the new 'text_clean' column is present
news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33189 entries, 0 to 33188
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   date          33189 non-null  object
 1   URL           33189 non-null  object
 2   description   30316 non-null  object
 3   introduction  30316 non-null  object
 4   id            33189 non-null  int64 
 5   speaker       33189 non-null  object
 6   p             33189 non-null  object
 7   text_clean    33189 non-null  object
dtypes: int64(1), object(7)
memory usage: 2.0+ MB


In [12]:
# Give every corpus a 'word_count' column and discard rows whose cleaned text
# contains no words.
dfs = [somi, mtfa, interviews, news, lavrov]
for df in dfs:
    # number of whitespace-separated words in the cleaned text
    df['word_count'] = df['text_clean'].apply(lambda text: len(text.split()))
    # drop in place so that somi, mtfa, ... themselves lose the empty rows
    df.drop(index=df.index[df['word_count'] == 0], inplace=True)



In [13]:
# report, per corpus, how many rows are longer than 250 words
for df in dfs:
    count = df.loc[df['word_count'] > 250, 'word_count'].count()
    print(f"{count} rows in {len(df)} have word_count greater than 250.")

18 rows in 36547 have word_count greater than 250.
14 rows in 14 have word_count greater than 250.
9 rows in 9380 have word_count greater than 250.
4 rows in 33188 have word_count greater than 250.
3808 rows in 5207 have word_count greater than 250.


### Split text_clean into chunks of at most 250 words

In [14]:
def split_into_chunks(text, n):
    """Split ``text`` into consecutive chunks of at most ``n`` words.

    Words are the whitespace-separated tokens returned by ``str.split()``;
    each chunk is re-joined with single spaces. The last chunk holds the
    remainder and may be shorter than ``n``. A text without any words yields
    an empty list.

    Args:
        text: the string to split.
        n: maximum number of words per chunk; must be at least 1.

    Returns:
        A list of chunk strings, in their original order.

    Raises:
        ValueError: if ``n`` is smaller than 1.
    """
    # A negative step would make range() empty and silently discard the text,
    # and a zero step fails with an unrelated-looking range() error.
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")
    words = text.split()
    return [' '.join(words[start:start + n]) for start in range(0, len(words), n)]

In [15]:
# split every mtfa 'text_clean' entry into chunks of at most chunk_size words
chunk_size = 250
mtfa['chunks'] = mtfa['text_clean'].apply(split_into_chunks, args=(chunk_size,))
# one row per chunk, with a fresh 0..n-1 index
new_mtfa = mtfa.explode('chunks').reset_index(drop=True)
# number of words in each chunk
new_mtfa['word_count_chunk'] = new_mtfa['chunks'].apply(lambda chunk: len(chunk.split()))

In [16]:
# split every lavrov 'text_clean' entry into chunks of at most chunk_size words
chunk_size = 250
lavrov['chunks'] = lavrov['text_clean'].apply(split_into_chunks, args=(chunk_size,))
# one row per chunk, with a fresh 0..n-1 index
new_lavrov = lavrov.explode('chunks').reset_index(drop=True)
# number of words in each chunk
new_lavrov['word_count_chunk'] = new_lavrov['chunks'].apply(lambda chunk: len(chunk.split()))

In [17]:
# split every somi 'text_clean' entry into chunks of at most chunk_size words
chunk_size = 250
somi['chunks'] = somi['text_clean'].apply(split_into_chunks, args=(chunk_size,))
# one row per chunk, with a fresh 0..n-1 index
new_somi = somi.explode('chunks').reset_index(drop=True)
# number of words in each chunk
new_somi['word_count_chunk'] = new_somi['chunks'].apply(lambda chunk: len(chunk.split()))

In [18]:
# split every interviews 'text_clean' entry into chunks of at most chunk_size words
chunk_size = 250
interviews['chunks'] = interviews['text_clean'].apply(split_into_chunks, args=(chunk_size,))
# one row per chunk, with a fresh 0..n-1 index
new_interviews = interviews.explode('chunks').reset_index(drop=True)
# number of words in each chunk
new_interviews['word_count_chunk'] = new_interviews['chunks'].apply(lambda chunk: len(chunk.split()))

In [19]:
# split every news 'text_clean' entry into chunks of at most chunk_size words
chunk_size = 250
news['chunks'] = news['text_clean'].apply(split_into_chunks, args=(chunk_size,))
# one row per chunk, with a fresh 0..n-1 index
new_news = news.explode('chunks').reset_index(drop=True)
# number of words in each chunk
new_news['word_count_chunk'] = new_news['chunks'].apply(lambda chunk: len(chunk.split()))

In [20]:
# sanity check: count the chunk rows in each frame that are still longer than 250 words
dfs = [new_interviews, new_lavrov, new_mtfa, new_somi, new_news]
for df in dfs:
    count = df.loc[df['word_count_chunk'] > 250, 'word_count_chunk'].count()
    print(f"{count} rows in {len(df)} have word_count greater than 250.")

0 rows in 9389 have word_count greater than 250.
0 rows in 24947 have word_count greater than 250.
0 rows in 488 have word_count greater than 250.
0 rows in 36565 have word_count greater than 250.
0 rows in 33192 have word_count greater than 250.


In [29]:
# Drop the 'lemma' column that came with the loaded lavrov data.
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the column is already gone.
new_lavrov = new_lavrov.drop(columns='lemma', errors='ignore')

### tokens and stopwords

In [22]:
# Load the 'ru_core_news_sm' spaCy pipeline
nlp_spacy = spacy.load('ru_core_news_sm')
# turn the "parser" component off ...
nlp_spacy.disable_pipe("parser")
# ... and make sure the "senter" and "ner" components are on
for component in ("senter", "ner"):
    nlp_spacy.enable_pipe(component)

In [36]:
#Stopwords
# Extend the pipeline's default stop-word set with corpus-specific words.
nlp_spacy.Defaults.stop_words |= {
    'два', 'день', 'дорогой', 'добрый', 'коллега',
    'раз', 'сегодня', 'спасибо', 'уважаемый', 'уважаемые',
}
# Read the set back from the object that was just extended. This guarantees
# the custom words are included (no reliance on the language module's
# STOP_WORDS being the very same set object) and avoids rebinding the name
# `stop_words` from an imported module to a set.
stop_words = nlp_spacy.Defaults.stop_words

#### Token, POS, Lemma

In [37]:
def process_docs(docs):
    """Run ``nlp_spacy`` over ``docs`` and yield annotations for each document.

    For every document produced by ``nlp_spacy.pipe`` a 3-tuple
    ``(tokens, lemma, pos)`` is yielded:

    * tokens -- lower-cased text of every token
    * lemma  -- lower-cased lemma of every token whose lower-cased lemma is
      not in the module-level ``stop_words``
    * pos    -- ``pos_`` tag of every token
    """
    # NOTE(review): n_process=-1 presumably means "use all CPUs" -- confirm
    # against the spaCy documentation.
    for parsed in nlp_spacy.pipe(docs, batch_size=50, n_process=-1):
        tokens, lemma, pos = [], [], []
        for tok in parsed:
            tokens.append(tok.text.lower())
            pos.append(tok.pos_)
            lowered_lemma = tok.lemma_.lower()
            # stop words are filtered from the lemma list only
            if lowered_lemma not in stop_words:
                lemma.append(lowered_lemma)
        yield tokens, lemma, pos

In [39]:
# Annotate every mtfa chunk: process_docs yields one (tokens, lemma, pos) triple
# per chunk and zip(*...) regroups the triples into three column-wise sequences.
new_mtfa['tokens'], new_mtfa['lemma'], new_mtfa['pos'] = zip(*process_docs(new_mtfa['chunks'].astype('unicode').values))

In [41]:
# Annotate every somi chunk: process_docs yields one (tokens, lemma, pos) triple
# per chunk and zip(*...) regroups the triples into three column-wise sequences.
new_somi['tokens'], new_somi['lemma'], new_somi['pos'] = zip(*process_docs(new_somi['chunks'].astype('unicode').values))

In [43]:
# Annotate every interviews chunk: process_docs yields one (tokens, lemma, pos)
# triple per chunk and zip(*...) regroups the triples into three column-wise sequences.
new_interviews['tokens'], new_interviews['lemma'], new_interviews['pos'] = zip(*process_docs(new_interviews['chunks'].astype('unicode').values))

In [45]:
# Annotate every news chunk: process_docs yields one (tokens, lemma, pos) triple
# per chunk and zip(*...) regroups the triples into three column-wise sequences.
new_news['tokens'], new_news['lemma'], new_news['pos'] = zip(*process_docs(new_news['chunks'].astype('unicode').values))

In [46]:
# Annotate every lavrov chunk: process_docs yields one (tokens, lemma, pos) triple
# per chunk and zip(*...) regroups the triples into three column-wise sequences.
new_lavrov['tokens'], new_lavrov['lemma'], new_lavrov['pos'] = zip(*process_docs(new_lavrov['chunks'].astype('unicode').values))

In [47]:
# Preview the first rows of the three annotation columns
new_lavrov[['tokens','lemma','pos']].head()

Unnamed: 0,tokens,lemma,pos
0,"[я, наверное, не, буду, выступать, с, речью, м...","[наверное, выступать, речь, продуктивный, побе...","[PRON, ADV, PART, AUX, VERB, ADP, NOUN, PRON, ..."
1,"[обсуждали, в, числе, прочего, ситуацию, в, ли...","[обсуждать, число, ситуация, ливия, разуметься...","[VERB, ADP, NOUN, ADJ, NOUN, ADP, PROPN, CCONJ..."
2,"[сказать, по, поводу, игил, наиболее, боеспосо...","[сказать, повод, игил, боеспособный, часть, иг...","[VERB, ADP, NOUN, PROPN, ADV, ADJ, NOUN, PROPN..."
3,"[в, лавров, этим, давно, уже, пора, было, заня...","[лавр, заняться, буквально, состояться, коротк...","[ADP, NOUN, PRON, ADV, ADV, NOUN, AUX, VERB, A..."
4,"[коалицией, в, нее, вошли, страны, которые, ре...","[коалиция, войти, страна, решить, поддерживать...","[NOUN, ADP, PRON, VERB, NOUN, PRON, VERB, VERB..."


In [48]:
# Collect the annotated chunk frames in one list
dfs = [new_interviews, new_lavrov, new_mtfa, new_news, new_somi]

In [52]:
# Save the annotated interviews frame. The path is relative to the project root
# (the notebook's working directory) -- the same relative location this
# notebook later reads the file back from -- instead of an absolute path inside
# one user's home directory.
new_interviews.to_pickle('data/corpus_adriana/token_lemma_pos/new_interviews.pkl')

In [55]:
# Write every annotated frame to <output_dir>/<name>.pkl.
# The directory is relative to the project root (the notebook's working
# directory) rather than an absolute path inside one user's home directory.
output_dir = 'data/corpus_adriana/token_lemma_pos'
os.makedirs(output_dir, exist_ok=True)

# Pair each file name with its frame explicitly. Zipping a list of names with
# the global `dfs` list is fragile: `dfs` is rebound several times in this
# notebook with different orderings, so running cells out of order could save
# a frame under another frame's file name.
frames_by_name = {
    'new_interviews': new_interviews,
    'new_lavrov': new_lavrov,
    'new_mtfa': new_mtfa,
    'new_news': new_news,
    'new_somi': new_somi,
}
names = list(frames_by_name)

for name, df in frames_by_name.items():
    # construct the output file path for the pickle file
    output_path = os.path.join(output_dir, f"{name}.pkl")
    # save the DataFrame to a pickle file
    df.to_pickle(output_path)

In [56]:
# Read the saved interviews frame back from disk (path relative to the working
# directory) and preview its first rows.
# NOTE: pd.read_pickle can execute arbitrary code while unpickling -- only load trusted files.
new_interviews = pd.read_pickle('data/corpus_adriana/token_lemma_pos/new_interviews.pkl')
new_interviews.head()

Unnamed: 0,date,url,title,speaker,text,text_clean,word_count,chunks,word_count_chunk,tokens,lemma,pos
0,2014-03-06,http://kremlin.ru/events/president/transcripts...,Интервью российским и иностранным СМИ в преддв...,Вопрос:,"С точки зрения внимания общественности и СМИ, ...",с точки зрения внимания общественности и сми л...,30,с точки зрения внимания общественности и сми л...,30,"[с, точки, зрения, внимания, общественности, и...","[точка, зрение, внимание, общественность, сми,...","[ADP, NOUN, NOUN, NOUN, NOUN, CCONJ, NOUN, ADJ..."
1,2014-03-06,http://kremlin.ru/events/president/transcripts...,Интервью российским и иностранным СМИ в преддв...,В.Путин:,"Начнём с того, что Великобританию принято счит...",начнём с того что великобританию принято счита...,81,начнём с того что великобританию принято счита...,81,"[начнём, с, того, что, великобританию, принято...","[начнём, великобританию, принять, считать, род...","[VERB, ADP, PRON, SCONJ, NOUN, VERB, VERB, NOU..."
2,2014-03-06,http://kremlin.ru/events/president/transcripts...,Интервью российским и иностранным СМИ в преддв...,В.Путин:,Наша страна принимает участие в Паралимпиадах ...,наша страна принимает участие в паралимпиадах ...,30,наша страна принимает участие в паралимпиадах ...,30,"[наша, страна, принимает, участие, в, паралимп...","[страна, принимать, участие, паралимпиадах, во...","[DET, NOUN, VERB, NOUN, ADP, NOUN, ADP, ADJ, N..."
3,2014-03-06,http://kremlin.ru/events/president/transcripts...,Интервью российским и иностранным СМИ в преддв...,В.Путин:,"В целом паралимпийский спорт, его замечательны...",в целом паралимпийский спорт его замечательные...,40,в целом паралимпийский спорт его замечательные...,40,"[в, целом, паралимпийский, спорт, его, замечат...","[целое, паралимпийский, спорт, замечательный, ...","[ADP, NOUN, ADJ, NOUN, DET, ADJ, NOUN, VERB, P..."
4,2014-03-06,http://kremlin.ru/events/president/transcripts...,Интервью российским и иностранным СМИ в преддв...,В.Путин:,"Кроме того, мы будем масштабно транслировать э...",кроме того мы будем масштабно транслировать эт...,43,кроме того мы будем масштабно транслировать эт...,43,"[кроме, того, мы, будем, масштабно, транслиров...","[масштабно, транслировать, соревнование, между...","[ADP, PRON, PRON, AUX, ADV, VERB, DET, NOUN, A..."
