In [None]:
# Install the third-party packages into the environment of the running kernel
# (%pip targets the kernel, whereas !pip may resolve to a different Python).
%pip install beautifulsoup4 nltk scikit-learn num2words

In [None]:
import csv
import glob
import pickle
import re
from pathlib import Path

import nltk
import numpy as np
import pandas as pd

# Fetch only the NLTK resources this notebook uses instead of the complete
# 'all' collection: tokenizer models for word_tokenize ('punkt' / 'punkt_tab',
# depending on the NLTK release), the stop-word lists, and the WordNet data
# used by WordNetLemmatizer ('wordnet', plus 'omw-1.4' on some releases).
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource, quiet=True)

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from num2words import num2words
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

from datetime import datetime

from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

In [64]:
input_data = 'data/0_isw_reports'
output_data = 'data/1_raw_isw_data'
output_file = 'raw_isw_report.csv'

# Exact class attribute of the <div> that wraps the report body.
body_class = 'field field-name-body field-type-text-with-summary field-label-hidden'

# Path.glob uses the platform's own path separator, so the lookup works on
# every OS (a hard-coded backslash only matches on Windows). Sorting makes the
# processing order reproducible.
files = sorted(Path(input_data).glob('*.html'))

all_data = []

for file in files:
    # The file stem is parsed as the report date; strptime raises ValueError
    # for any file that is not named '<YYYY-MM-DD>.html'.
    fname = file.stem
    date = datetime.strptime(fname, '%Y-%m-%d').date()

    # Hand BeautifulSoup the raw bytes so it detects the page encoding itself
    # instead of depending on the locale default, and name the parser
    # explicitly so every machine builds the same tree.
    with open(file, "rb") as current_file:
        parsed_html = BeautifulSoup(current_file, 'html.parser')

    title = parsed_html.head.find('title').text
    text_title = parsed_html.body.find("h1", attrs={'id': 'page-title'}).text
    text_main_raw = parsed_html.body.find("div", attrs={'class': body_class})

    # Drop the first <p> that is the second paragraph among its siblings
    # (presumably the authors/initials line, judging by the variable name)
    # before extracting the text. Bodies with fewer paragraphs are kept as is.
    initials = text_main_raw.select('p:nth-of-type(2)')
    if initials:
        initials[0].replace_with('')
    text_main = text_main_raw.text

    all_data.append({
        "date": date,
        "title": title,
        "text_title": text_title,
        "main_html": text_main,
    })

In [65]:
# One row per parsed report, ordered chronologically by publication date.
df = pd.DataFrame(all_data).sort_values(by='date')
df.head(10)

Unnamed: 0,date,title,text_title,main_html
0,2022-02-24,Russia-Ukraine Warning Update: Initial Russian...,Russia-Ukraine Warning Update: Initial Russian...,"\n\nFebruary 24, 3:00 pm EST\nRussian Presid..."
1,2022-02-25,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,"Mason Clark, George Barros, and Kateryna Step..."
2,2022-02-26,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,"Mason Clark, George Barros, and Katya Stepane..."
3,2022-02-27,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,"\n\nFebruary 27, 4pm EST\n\nThe Russian milit..."
4,2022-02-28,"Russian Offensive Campaign Assessment, Februar...","Russian Offensive Campaign Assessment, Februar...","\n\nFebruary 28, 3:30pm EST\nThe Russian mil..."
5,2022-03-01,"Russian Offensive Campaign Assessment, March 1...","Russian Offensive Campaign Assessment, March 1","\n\nMarch 1, 3:00 pm EST\nRussian forces are..."
6,2022-03-02,"Russian Offensive Campaign Assessment, March 2...","Russian Offensive Campaign Assessment, March 2","\n\nMarch 2, 4:30 pm EST\nRussian forces res..."
7,2022-03-03,Ukraine Conflict Update 14 | Institute for the...,Ukraine Conflict Update 14,"\n\nwith the Critical Threats Project, AEI\n..."
8,2022-03-04,Ukraine Conflict Update 15 | Institute for the...,Ukraine Conflict Update 15,"\n\nwith the Critical Threats Project, AEI\n..."
9,2022-03-05,"Explainer on Russian Conscription, Reserve, an...","Explainer on Russian Conscription, Reserve, an...","\n\nMarch 5, 2022\nThe early announcement of..."


In [67]:
# Persist the raw parsed reports. The target folder is created first because
# to_csv does not create missing directories.
raw_csv_path = Path(output_data) / output_file
raw_csv_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(raw_csv_path, sep=';', index=False)

In [69]:
# Pick one report (positional row 46 of the date-sorted frame built above) and
# print its extracted text to see which parts still have to be cleaned up.
some_row = df.iloc[46]
some_row_in_html = some_row['main_html']
print(some_row_in_html)

  

April 11, 8:30pm ET
Special Edition: Army General Aleksandr Vladimirovich Dvornikov
US intelligence reported over the weekend of April 9-10 that Russian Army General Aleksandr Vladimirovich Dvornikov, commander of the Southern Military District, is now in overall command of Russian operations in Ukraine. This news is unsurprising; Dvornikov is the most senior of the three Russian military district commanders involved in the invasion, and the Russian military is concentrating its efforts almost exclusively in the area of Ukraine that Dvornikov had already been commanding. Had Putin selected another officer to command the entire war effort, he would likely have had to relieve Dvornikov for these reasons. There is no reason to suppose, therefore, that Dvornikov was specifically selected to take control of the war effort for any particular skills or experience he might have. Nor is there reason to think that the conduct of the Russian war effort will change materially more than it was 

## Delete all [n] symbols

In [71]:
# Strip bracketed numeric markers such as "[12]" from every report.
# The pattern is a raw string so that "\[" and "\d" reach the regex engine
# unchanged instead of being treated as (invalid) string escapes.
df['main_html_1'] = df['main_html'].apply(lambda x: re.sub(r'\[(\d+)\]', "", x))
page_clean = df.iloc[46]['main_html_1']
print(page_clean)

  

April 11, 8:30pm ET
Special Edition: Army General Aleksandr Vladimirovich Dvornikov
US intelligence reported over the weekend of April 9-10 that Russian Army General Aleksandr Vladimirovich Dvornikov, commander of the Southern Military District, is now in overall command of Russian operations in Ukraine. This news is unsurprising; Dvornikov is the most senior of the three Russian military district commanders involved in the invasion, and the Russian military is concentrating its efforts almost exclusively in the area of Ukraine that Dvornikov had already been commanding. Had Putin selected another officer to command the entire war effort, he would likely have had to relieve Dvornikov for these reasons. There is no reason to suppose, therefore, that Dvornikov was specifically selected to take control of the war effort for any particular skills or experience he might have. Nor is there reason to think that the conduct of the Russian war effort will change materially more than it was 

## Delete all links

In [73]:
# Step 1: turn "<whitespace>dot " back into "." (e.g. "example dot com").
# Step 2: delete every http:// or https:// URL up to the next whitespace.
# Both patterns are raw strings so "\s" and "\S" are regex classes rather than
# invalid string escapes. The former third alternative ".$html" was dropped:
# nothing can follow the end-of-string anchor, so it never matched.
df['main_html_2'] = (
    df['main_html_1']
    .str.replace(r'\sdot ', '.', regex=True)
    .apply(lambda x: re.sub(r'https?://\S+', "", x))
)
page_clean_1 = df.iloc[46]['main_html_2']
print(page_clean_1)
df.head(10)

  

April 11, 8:30pm ET
Special Edition: Army General Aleksandr Vladimirovich Dvornikov
US intelligence reported over the weekend of April 9-10 that Russian Army General Aleksandr Vladimirovich Dvornikov, commander of the Southern Military District, is now in overall command of Russian operations in Ukraine. This news is unsurprising; Dvornikov is the most senior of the three Russian military district commanders involved in the invasion, and the Russian military is concentrating its efforts almost exclusively in the area of Ukraine that Dvornikov had already been commanding. Had Putin selected another officer to command the entire war effort, he would likely have had to relieve Dvornikov for these reasons. There is no reason to suppose, therefore, that Dvornikov was specifically selected to take control of the war effort for any particular skills or experience he might have. Nor is there reason to think that the conduct of the Russian war effort will change materially more than it was 

Unnamed: 0,date,title,text_title,main_html,main_html_1,main_html_2
0,2022-02-24,Russia-Ukraine Warning Update: Initial Russian...,Russia-Ukraine Warning Update: Initial Russian...,"\n\nFebruary 24, 3:00 pm EST\nRussian Presid...","\n\nFebruary 24, 3:00 pm EST\nRussian Presid...","\n\nFebruary 24, 3:00 pm EST\nRussian Presid..."
1,2022-02-25,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,"Mason Clark, George Barros, and Kateryna Step...","Mason Clark, George Barros, and Kateryna Step...","Mason Clark, George Barros, and Kateryna Step..."
2,2022-02-26,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,"Mason Clark, George Barros, and Katya Stepane...","Mason Clark, George Barros, and Katya Stepane...","Mason Clark, George Barros, and Katya Stepane..."
3,2022-02-27,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,"\n\nFebruary 27, 4pm EST\n\nThe Russian milit...","\n\nFebruary 27, 4pm EST\n\nThe Russian milit...","\n\nFebruary 27, 4pm EST\n\nThe Russian milit..."
4,2022-02-28,"Russian Offensive Campaign Assessment, Februar...","Russian Offensive Campaign Assessment, Februar...","\n\nFebruary 28, 3:30pm EST\nThe Russian mil...","\n\nFebruary 28, 3:30pm EST\nThe Russian mil...","\n\nFebruary 28, 3:30pm EST\nThe Russian mil..."
5,2022-03-01,"Russian Offensive Campaign Assessment, March 1...","Russian Offensive Campaign Assessment, March 1","\n\nMarch 1, 3:00 pm EST\nRussian forces are...","\n\nMarch 1, 3:00 pm EST\nRussian forces are...","\n\nMarch 1, 3:00 pm EST\nRussian forces are..."
6,2022-03-02,"Russian Offensive Campaign Assessment, March 2...","Russian Offensive Campaign Assessment, March 2","\n\nMarch 2, 4:30 pm EST\nRussian forces res...","\n\nMarch 2, 4:30 pm EST\nRussian forces res...","\n\nMarch 2, 4:30 pm EST\nRussian forces res..."
7,2022-03-03,Ukraine Conflict Update 14 | Institute for the...,Ukraine Conflict Update 14,"\n\nwith the Critical Threats Project, AEI\n...","\n\nwith the Critical Threats Project, AEI\n...","\n\nwith the Critical Threats Project, AEI\n..."
8,2022-03-04,Ukraine Conflict Update 15 | Institute for the...,Ukraine Conflict Update 15,"\n\nwith the Critical Threats Project, AEI\n...","\n\nwith the Critical Threats Project, AEI\n...","\n\nwith the Critical Threats Project, AEI\n..."
9,2022-03-05,"Explainer on Russian Conscription, Reserve, an...","Explainer on Russian Conscription, Reserve, an...","\n\nMarch 5, 2022\nThe early announcement of...","\n\nMarch 5, 2022\nThe early announcement of...","\n\nMarch 5, 2022\nThe early announcement of..."


## Delete all dates


In [75]:
# Matches a newline, 5-15 characters on the same line, a "digit:digit" pair and
# up to 9 further characters, i.e. header lines such as "\nApril 11, 8:30pm ET".
# Every occurrence in a report is removed. Headers without a colon-separated
# time ("February 27, 4pm EST", "March 5, 2022") do not match and stay in place,
# as the preview below shows.
date_line_pattern = re.compile(r'\n.{5,15}\d:\d.{0,9}')

df['main_html_3'] = df['main_html_2'].apply(lambda y: date_line_pattern.sub("", y))
# Kept for manual inspection of the sample report.
page_clean_2 = df.iloc[46]['main_html_3']
df.head(10)

Unnamed: 0,date,title,text_title,main_html,main_html_1,main_html_2,main_html_3
0,2022-02-24,Russia-Ukraine Warning Update: Initial Russian...,Russia-Ukraine Warning Update: Initial Russian...,"\n\nFebruary 24, 3:00 pm EST\nRussian Presid...","\n\nFebruary 24, 3:00 pm EST\nRussian Presid...","\n\nFebruary 24, 3:00 pm EST\nRussian Presid...",\n\nRussian President Vladimir Putin began a...
1,2022-02-25,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,"Mason Clark, George Barros, and Kateryna Step...","Mason Clark, George Barros, and Kateryna Step...","Mason Clark, George Barros, and Kateryna Step...","Mason Clark, George Barros, and Kateryna Step..."
2,2022-02-26,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,"Mason Clark, George Barros, and Katya Stepane...","Mason Clark, George Barros, and Katya Stepane...","Mason Clark, George Barros, and Katya Stepane...","Mason Clark, George Barros, and Katya Stepane..."
3,2022-02-27,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,"\n\nFebruary 27, 4pm EST\n\nThe Russian milit...","\n\nFebruary 27, 4pm EST\n\nThe Russian milit...","\n\nFebruary 27, 4pm EST\n\nThe Russian milit...","\n\nFebruary 27, 4pm EST\n\nThe Russian milit..."
4,2022-02-28,"Russian Offensive Campaign Assessment, Februar...","Russian Offensive Campaign Assessment, Februar...","\n\nFebruary 28, 3:30pm EST\nThe Russian mil...","\n\nFebruary 28, 3:30pm EST\nThe Russian mil...","\n\nFebruary 28, 3:30pm EST\nThe Russian mil...",\n\nThe Russian military is reorganizing its...
5,2022-03-01,"Russian Offensive Campaign Assessment, March 1...","Russian Offensive Campaign Assessment, March 1","\n\nMarch 1, 3:00 pm EST\nRussian forces are...","\n\nMarch 1, 3:00 pm EST\nRussian forces are...","\n\nMarch 1, 3:00 pm EST\nRussian forces are...",\n\nRussian forces are completing the reinfo...
6,2022-03-02,"Russian Offensive Campaign Assessment, March 2...","Russian Offensive Campaign Assessment, March 2","\n\nMarch 2, 4:30 pm EST\nRussian forces res...","\n\nMarch 2, 4:30 pm EST\nRussian forces res...","\n\nMarch 2, 4:30 pm EST\nRussian forces res...",\n\nRussian forces resumed offensive operati...
7,2022-03-03,Ukraine Conflict Update 14 | Institute for the...,Ukraine Conflict Update 14,"\n\nwith the Critical Threats Project, AEI\n...","\n\nwith the Critical Threats Project, AEI\n...","\n\nwith the Critical Threats Project, AEI\n...","\n\nwith the Critical Threats Project, AEI\n..."
8,2022-03-04,Ukraine Conflict Update 15 | Institute for the...,Ukraine Conflict Update 15,"\n\nwith the Critical Threats Project, AEI\n...","\n\nwith the Critical Threats Project, AEI\n...","\n\nwith the Critical Threats Project, AEI\n...","\n\nwith the Critical Threats Project, AEI\n..."
9,2022-03-05,"Explainer on Russian Conscription, Reserve, an...","Explainer on Russian Conscription, Reserve, an...","\n\nMarch 5, 2022\nThe early announcement of...","\n\nMarch 5, 2022\nThe early announcement of...","\n\nMarch 5, 2022\nThe early announcement of...","\n\nMarch 5, 2022\nThe early announcement of..."


## Delete unnecessary columns and rename the remaining ones


In [77]:
# Keep only the fully cleaned text column and give it its final name.
old_to_new = {'main_html_3': 'main_text'}
df_final_raw = df.drop(columns=['main_html', 'main_html_1', 'main_html_2'])
df_final = df_final_raw.rename(columns=old_to_new)
df_final.head(10)

Unnamed: 0,date,title,text_title,main_text
0,2022-02-24,Russia-Ukraine Warning Update: Initial Russian...,Russia-Ukraine Warning Update: Initial Russian...,\n\nRussian President Vladimir Putin began a...
1,2022-02-25,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,"Mason Clark, George Barros, and Kateryna Step..."
2,2022-02-26,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,"Mason Clark, George Barros, and Katya Stepane..."
3,2022-02-27,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,"\n\nFebruary 27, 4pm EST\n\nThe Russian milit..."
4,2022-02-28,"Russian Offensive Campaign Assessment, Februar...","Russian Offensive Campaign Assessment, Februar...",\n\nThe Russian military is reorganizing its...
5,2022-03-01,"Russian Offensive Campaign Assessment, March 1...","Russian Offensive Campaign Assessment, March 1",\n\nRussian forces are completing the reinfo...
6,2022-03-02,"Russian Offensive Campaign Assessment, March 2...","Russian Offensive Campaign Assessment, March 2",\n\nRussian forces resumed offensive operati...
7,2022-03-03,Ukraine Conflict Update 14 | Institute for the...,Ukraine Conflict Update 14,"\n\nwith the Critical Threats Project, AEI\n..."
8,2022-03-04,Ukraine Conflict Update 15 | Institute for the...,Ukraine Conflict Update 15,"\n\nwith the Critical Threats Project, AEI\n..."
9,2022-03-05,"Explainer on Russian Conscription, Reserve, an...","Explainer on Russian Conscription, Reserve, an...","\n\nMarch 5, 2022\nThe early announcement of..."


## Overwrite the .csv file with the final data frame

In [79]:
# Overwrite the raw CSV with the cleaned frame. The folder is created first
# because to_csv does not create missing directories.
final_csv_path = Path(output_data) / output_file
final_csv_path.parent.mkdir(parents=True, exist_ok=True)
df_final.to_csv(final_csv_path, sep=';', index=False)

In [81]:
# Location of the cleaned CSV written by the previous cell. No cell visible in
# this notebook reads it back; the in-memory df_final is used instead.
input_data_parsed = 'data/1_raw_isw_data/raw_isw_report.csv'
# File name used when the preprocessed frame is saved at the end of the notebook.
output_file_parsed = 'preprocessed_isw_report.csv'

## Remove words of one letter

In [100]:
def remove_oneletter_words(data):
    """Drop every token that is only one character long.

    ``data`` is converted with ``str`` and split with NLTK's ``word_tokenize``.
    The surviving tokens are concatenated with one space in front of each, so a
    non-empty result starts with a space.
    """
    tokens = word_tokenize(str(data), language="english")
    kept = [token for token in tokens if len(token) > 1]
    return "".join(" " + token for token in kept)

In [101]:
def convert_low(data):
    """Lower-case ``data`` using NumPy's element-wise string routine.

    The value comes straight from ``np.char.lower`` (a NumPy string value, not
    a plain ``str``); the next pipeline step wraps it in ``str()`` before
    tokenizing.
    """
    lowered = np.char.lower(data)
    return lowered

## Remove stopwords from the text using nltk library with exception for 'no', 'not'

In [102]:
def remove_stopwords(data):
    """Remove English stop words, but keep the negations 'no' and 'not'.

    ``data`` is converted with ``str`` and tokenized with ``word_tokenize``.
    Tokens are compared as-is against NLTK's English stop-word list, so the
    text should already be lower-cased. Every kept token is prefixed with one
    space in the returned string.
    """
    kept_anyway = {'no', 'not'}
    blocked = set(stopwords.words('english')) - kept_anyway

    tokens = word_tokenize(str(data), language="english")
    return "".join(" " + token for token in tokens if token not in blocked)

## Convert numbers into words with exception for words with numbers inside and ordinal numbers

In [103]:
def convert_numbers(data):
    """Spell out purely numeric tokens as English words.

    ``data`` is converted with ``str`` and tokenized with ``word_tokenize``.
    A token made only of decimal digits is replaced by ``num2words(token)``
    when its value is below 1,000,000,000, and by an empty string otherwise.
    Tokens that mix digits with other characters (e.g. "4pm") are left
    untouched. Each token is written with one leading space. Finally "-" and
    "," (which num2words emits, e.g. "twenty-four") are replaced by spaces.

    Returns the value produced by ``np.char.replace`` (a NumPy string value,
    not a plain ``str``).
    """
    words = word_tokenize(str(data), language="english")

    pieces = []
    for w in words:
        # isdecimal() is true only for characters int() can parse. isdigit()
        # also accepts e.g. superscript digits, for which int() raises
        # ValueError.
        if w.isdecimal():
            w = num2words(w) if int(w) < 1000000000 else ""
        pieces.append(" " + w)

    new_text = "".join(pieces)
    new_text = np.char.replace(new_text, "-", " ")
    new_text = np.char.replace(new_text, ",", " ")

    return new_text

## Remove symbols and special characters

In [104]:
def remove_signs(data):
    """Replace punctuation, special characters and newlines with spaces.

    For every character in ``symbols``, in order, three replacements are made:
    the character becomes a space, each pair of spaces becomes one space
    (a single, non-overlapping pass), and commas become spaces. Because only
    one collapsing pass follows each symbol, runs of spaces can remain in the
    result.

    Returns the value produced by ``np.char.replace`` (a NumPy string value,
    not a plain ``str``).
    """
    # The backslash is written as "\\" so the literal contains no invalid
    # escape sequence; the characters in the string are unchanged.
    symbols = "!\"#$%&()*+-—./:;<=>?@[\\]^_'`{|}~\n"

    for symbol in symbols:
        data = np.char.replace(data, symbol, " ")
        data = np.char.replace(data, "  ", " ")
        data = np.char.replace(data, ",", " ")
    return data

## Stemming and Lemmatization functions

In [105]:
def stemming(data):
    """Reduce every token to its Porter stem.

    ``data`` is converted with ``str`` and tokenized with ``word_tokenize``.
    The stems are concatenated with one space in front of each, so a non-empty
    result starts with a space.
    """
    porter = PorterStemmer()
    tokens = word_tokenize(str(data), language="english")
    return "".join(" " + porter.stem(token) for token in tokens)

def lemmatizing(data):
    """Replace every token by its WordNet lemma.

    ``data`` is converted with ``str`` and tokenized with ``word_tokenize``.
    ``WordNetLemmatizer.lemmatize`` is called with its default arguments. The
    lemmas are concatenated with one space in front of each, so a non-empty
    result starts with a space.
    """
    wordnet = WordNetLemmatizer()
    tokens = word_tokenize(str(data), language="english")
    return "".join(" " + wordnet.lemmatize(token) for token in tokens)

In [106]:
def _normalize_text(data):
    """Run the cleaning steps shared by ``convert`` and ``convert2``."""
    data = remove_oneletter_words(data)
    data = convert_low(data)
    data = remove_stopwords(data)
    data = remove_signs(data)
    data = convert_numbers(data)
    # Second pass: replacing symbols with spaces can split a token into new
    # one-letter pieces (e.g. "u.s" -> "u s").
    data = remove_oneletter_words(data)
    return data


def convert(data):
    """Clean ``data`` and return the text with every token stemmed."""
    return stemming(_normalize_text(data))


def convert2(data):
    """Clean ``data`` and return the text with every token lemmatized."""
    return lemmatizing(_normalize_text(data))

## Convert the raw data frame using the functions listed above and display the first 5 rows as an example

In [107]:
# Stemmed version of the sample report; not referenced by any later cell
# visible in this notebook.
withoutletters = convert(page_clean_1)

# Run both pipelines over every report: stemmed and lemmatized variants.
df_final['text_stemm'] = df_final["main_text"].apply(convert)
df_final['text_lemm'] = df_final["main_text"].apply(convert2)
df_final.head(5)

Unnamed: 0,date,title,text_title,main_text,text_stemm,text_lemm
0,2022-02-24,Russia-Ukraine Warning Update: Initial Russian...,Russia-Ukraine Warning Update: Initial Russian...,\n\nRussian President Vladimir Putin began a...,russian presid vladimir putin began larg scal...,russian president vladimir putin began large ...
1,2022-02-25,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,"Mason Clark, George Barros, and Kateryna Step...",mason clark georg barro kateryna stepanenko r...,mason clark george barros kateryna stepanenko...
2,2022-02-26,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,"Mason Clark, George Barros, and Katya Stepane...",mason clark georg barro katya stepanenko russ...,mason clark george barros katya stepanenko ru...
3,2022-02-27,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,"\n\nFebruary 27, 4pm EST\n\nThe Russian milit...",februari twenti seven 4pm est russian militar...,february twenty seven 4pm est russian militar...
4,2022-02-28,"Russian Offensive Campaign Assessment, Februar...","Russian Offensive Campaign Assessment, Februar...",\n\nThe Russian military is reorganizing its...,russian militari reorgan militari effort atte...,russian military reorganizing military effort...


## Print row 74 as an example of lemmatization

In [109]:
# All lemmatized reports as a plain list; this is the corpus fed to the
# vectorizer further down.
docs = df_final["text_lemm"].tolist()
# Print the lemmatized text of one report (positional row 74) as a sample.
# Note: some_row / some_row_in_html are re-used from the earlier inspection cell.
some_row = df_final.iloc[74]
some_row_in_html = some_row['text_lemm']
print(some_row_in_html)

 karolina hird kateryna stepanenko mason clark russian force continue face widespread force generation challenge senior u defense official stated may u not observed indicator new major russian mobilization member private military company wagner group urgently requested hundred thousand additional troop reinforce russian effort donbas official noted russia currently ninety seven battalion tactical group btgs ukraine btgs moving ukraine refit resupply suggesting russian troop continue sustain substantial damage combat isw previously assessed russian btgs heavily degraded counting btgs not useful metric russian combat power main ukrainian intelligence directorate gur claimed under trained ill equipped russian conscript still sent active combat despite kremlin denying practice prisoner war bar seven detachment wagner group claimed covert mobilization underway russian send conscript clean damage caused combat self proclaimed donetsk luhansk people republic russian troop ukraine continue dis

## Display the number of documents to make sure none of the files were missed

In [110]:
# Number of lemmatized documents in the corpus.
len(docs)

333

## Create instance of count vectorizer to vectorize text data

In [122]:
# max_df / min_df are scikit-learn's document-frequency cut-offs: terms that
# occur in more than 98% of the documents or in fewer than 2 documents are
# left out of the vocabulary.
cv = CountVectorizer(max_df=0.98,min_df=2)
# Learn the vocabulary from the lemmatized corpus and build the
# document-term count matrix in one step.
word_count_vector= cv.fit_transform(docs)

# (number of documents, vocabulary size)
word_count_vector.shape

(333, 9191)

In [125]:
# Store the fitted CountVectorizer so the same vocabulary can be re-used later.
# The folder is created first because open() does not create missing directories.
cv_path = Path('data/3_isw_vectorized_data/cv.pkl')
cv_path.parent.mkdir(parents=True, exist_ok=True)
with cv_path.open('wb') as f:
    pickle.dump(cv, f)

In [127]:
# TF-IDF weighting on top of the raw counts; IDF usage and IDF smoothing are
# switched on explicitly.
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
# Fitting learns one IDF weight per vocabulary term (exposed as .idf_, which
# the inspection cell below reads).
tfidf_transformer.fit(word_count_vector)

In [128]:
# Store the fitted TfidfTransformer next to the vectorizer.
# The folder is created first because open() does not create missing directories.
tfidf_path = Path('data/3_isw_vectorized_data/tfidf_transformer.pkl')
tfidf_path.parent.mkdir(parents=True, exist_ok=True)
with tfidf_path.open('wb') as f:
    pickle.dump(tfidf_transformer, f)

## Vectorization: inspect the learned IDF weights

In [129]:
# Pair every vocabulary term with its learned IDF weight and order the table
# ascending, so the terms with the lowest weights come first.
df_idf = pd.DataFrame(
    tfidf_transformer.idf_,
    index=cv.get_feature_names_out(),
    columns=["idf_weight"],
).sort_values(by="idf_weight")

df_idf.head(15)

Unnamed: 0,idf_weight
staff,1.021181
attack,1.024244
kharkiv,1.024244
area,1.024244
position,1.027316
combat,1.030397
luhansk,1.030397
main,1.033489
twenty,1.033489
three,1.0397


## Preview the final data frame before saving it

In [133]:
# Preview of the frame (raw text plus stemmed and lemmatized columns) that the
# next cell writes to disk.
df_final.head(10)

Unnamed: 0,date,title,text_title,main_text,text_stemm,text_lemm
0,2022-02-24,Russia-Ukraine Warning Update: Initial Russian...,Russia-Ukraine Warning Update: Initial Russian...,\n\nRussian President Vladimir Putin began a...,russian presid vladimir putin began larg scal...,russian president vladimir putin began large ...
1,2022-02-25,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,"Mason Clark, George Barros, and Kateryna Step...",mason clark georg barro kateryna stepanenko r...,mason clark george barros kateryna stepanenko...
2,2022-02-26,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,"Mason Clark, George Barros, and Katya Stepane...",mason clark georg barro katya stepanenko russ...,mason clark george barros katya stepanenko ru...
3,2022-02-27,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,"\n\nFebruary 27, 4pm EST\n\nThe Russian milit...",februari twenti seven 4pm est russian militar...,february twenty seven 4pm est russian militar...
4,2022-02-28,"Russian Offensive Campaign Assessment, Februar...","Russian Offensive Campaign Assessment, Februar...",\n\nThe Russian military is reorganizing its...,russian militari reorgan militari effort atte...,russian military reorganizing military effort...
5,2022-03-01,"Russian Offensive Campaign Assessment, March 1...","Russian Offensive Campaign Assessment, March 1",\n\nRussian forces are completing the reinfo...,russian forc complet reinforc resuppli troop ...,russian force completing reinforcement resupp...
6,2022-03-02,"Russian Offensive Campaign Assessment, March 2...","Russian Offensive Campaign Assessment, March 2",\n\nRussian forces resumed offensive operati...,russian forc resum offens oper support envelo...,russian force resumed offensive operation sup...
7,2022-03-03,Ukraine Conflict Update 14 | Institute for the...,Ukraine Conflict Update 14,"\n\nwith the Critical Threats Project, AEI\n...",critic threat project aei march two thousand ...,critical threat project aei march two thousan...
8,2022-03-04,Ukraine Conflict Update 15 | Institute for the...,Ukraine Conflict Update 15,"\n\nwith the Critical Threats Project, AEI\n...",critic threat project aei march two thousand ...,critical threat project aei march two thousan...
9,2022-03-05,"Explainer on Russian Conscription, Reserve, an...","Explainer on Russian Conscription, Reserve, an...","\n\nMarch 5, 2022\nThe early announcement of...",march two thousand and twenti two earli annou...,march two thousand and twenty two early annou...


In [134]:
# Save the preprocessed reports. The folder is created first because to_csv
# does not create missing directories.
preprocessed_csv_path = Path("data/2_preprocessed_isw_data") / output_file_parsed
preprocessed_csv_path.parent.mkdir(parents=True, exist_ok=True)
df_final.to_csv(preprocessed_csv_path, sep=';', index=False)