In [None]:
!pip install beautifulsoup4 nltk scikit-learn num2words

In [41]:
import pandas as pd
import glob
import csv
import re
import nltk
import numpy as np
import pickle
import calendar
import string

# nltk.download('all')
# nltk.download('stopwords')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords as sw
from num2words import num2words
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

from datetime import datetime

from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

In [109]:
input_data = 'data/0_isw_reports'
output_data = 'data/1_raw_isw_data'
output_file = 'raw_isw_report.csv'


files = glob.glob(f'{input_data}\*.html')

all_data = []

for file in files:
    
    d = {}

    fname = file.replace(f'{input_data}\\', '').replace('.html', '')
    date = datetime.strptime(fname, '%Y-%m-%d').date()
    
    with open (file, "r") as current_file:
        parsed_html = BeautifulSoup(current_file.read())
        title = parsed_html.head.find('title').text
        link = parsed_html.head.find('link', attrs ={'rel':'canonical'}, href=True).attrs["href"]
        
        text_title =  parsed_html.body.find ("h1", attrs={'id':'page-title'}).text
        text_main = parsed_html.body.find ("div", attrs={'class':'field field-name-body field-type-text-with-summary field-label-hidden'})
        
        remove_a_tags(text_main)
        remove_images(text_main)
        remove_names(text_main)
        
        d = {
            "date":date,
            "title":title,
            "text_title":text_title,
            "main_html": text_main.text
        }
        
        all_data.append(d)

In [107]:
def remove_a_tags(soup):
    for p in soup.find_all('p'):
        if p.find_all('a', recursive=True):
            p.extract()

In [108]:
def remove_images(soup):
    for img in soup.find_all('img'):
        img.extract()

In [106]:
def remove_names(soup):
    initials = [soup.select('p:nth-of-type(1)'), soup.select('p:nth-of-type(2)')]
    
    for i in range(3, 5):
        if all(child.name == 'strong' for child in soup.select(f'p:nth-of-type({i})')[0].children):
            initials.append(soup.select(f'p:nth-of-type({i})'))
        
    for i in range(len(initials)):
        initials[i][0].replaceWith('')

In [111]:
df = pd.DataFrame.from_dict(all_data)
df = df.sort_values(by = ['date'])
df.head(10)

Unnamed: 0,date,title,text_title,main_html
0,2022-02-24,Russia-Ukraine Warning Update: Initial Russian...,Russia-Ukraine Warning Update: Initial Russian...,\n\n\nRussian President Vladimir Putin began ...
1,2022-02-25,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,\n\nRussian forces entered major Ukrainian ci...
2,2022-02-26,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,\n\nRussian forces’ main axes of advance in t...
3,2022-02-27,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,\n\n\n\nThe Russian military has likely recog...
4,2022-02-28,"Russian Offensive Campaign Assessment, Februar...","Russian Offensive Campaign Assessment, Februar...",\n\n\nThe Russian military is reorganizing it...
5,2022-03-01,"Russian Offensive Campaign Assessment, March 1...","Russian Offensive Campaign Assessment, March 1",\n\n\nRussian forces are completing the reinf...
6,2022-03-02,"Russian Offensive Campaign Assessment, March 2...","Russian Offensive Campaign Assessment, March 2",\n\n\nRussian forces resumed offensive operat...
7,2022-03-03,Ukraine Conflict Update 14 | Institute for the...,Ukraine Conflict Update 14,"\n\nwith the Critical Threats Project, AEI\nM..."
8,2022-03-04,Ukraine Conflict Update 15 | Institute for the...,Ukraine Conflict Update 15,"\n\nwith the Critical Threats Project, AEI\nM..."
9,2022-03-05,"Explainer on Russian Conscription, Reserve, an...","Explainer on Russian Conscription, Reserve, an...",\n\n\nThe early announcement of the 2022 spri...


In [112]:
df.to_csv(f"{output_data}/{output_file}", sep = ';', index = False) 

## Delete all [n], \n, and \xa0 instances

In [113]:
def remove_newlines_and_url_numbers(text):
    text = re.sub('\n|\xa0|\[\d+\]', '', text)
    return text

## Remove URLs

In [134]:
def remove_urls(text):
    pattern = '(http|ftp|https|ttps):\/\/([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-])'
    text = re.sub(pattern, '', text)
    return text

## Delete all dates


In [114]:
def remove_dates(text):
    months = []
    for i in range(1, 13):
        months.append(calendar.month_name[i])
        
    pattern = "(" + "|".join(months) + ")" + r" \d+( and \d*)?"
    text = re.sub(pattern, '', text)
    return text

## Remove one letter words

In [115]:
def remove_1_letter_words(text):
    words = word_tokenize(text, language="english")
    new_text = []
    
    for w in words:
        new_text.append(w if len(w) > 1 else '')
            
    return ' '.join(new_text) 

## Remove stopwords from the text using nltk library with exception for 'no', 'not'

In [116]:
def remove_stopwords(text):
    stopwords = set(sw.words('english'))
    exclude = {'no','not'}
    stopwords -= exclude
    
    words = word_tokenize(text, language="english")
    new_text = []
    
    for w in words:
        new_text.append(w if w not in stopwords else '')
        
    return ' '.join(new_text)

## Convert numbers into words with exception for words with numbers inside and ordinal numbers

In [117]:
def num_to_word(text):
    words = word_tokenize(text, language="english")
    pattern = r'^-?\d+(?:\.\d+)?$'
    new_text = []
    for w in words: 
        new_text.append(num2words(w) if re.fullmatch(pattern, w) else w)
    new_text = ' '.join(new_text)
    
    
    words = word_tokenize(new_text, language="english")
    new_text = []
    for w in words: 
        new_text.append(num2words(w) if w.isnumeric() or w.isdigit() else w)
        
    return ' '.join(new_text)

## Remove all punctuation

In [118]:
def remove_punctuation(text):
    symbols = string.punctuation
    
    for symbol in symbols:
        text = text.replace(symbol, ' ')
        
    text = text.replace('  ', ' ')
    return text

## Stemming and Lemmatization functions

In [119]:
def stem(data):
    stemmer = PorterStemmer()
    
    words = word_tokenize(data, language="english")
    new_text = []
    
    for w in words:
        new_text.append(stemmer.stem(w))
    return ' '.join(new_text)

def lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    
    words = word_tokenize(text, language="english")
    new_text = []
    
    for w in words:
        new_text.append(lemmatizer.lemmatize(w))
    
    return ' '.join(new_text)

In [135]:
def convert(text, method='lemm'):
    text = remove_newlines_and_url_numbers(text)
    text = remove_urls(text)
    text = remove_dates(text)
    text = remove_1_letter_words(text)
    text = text.lower()
    text = remove_stopwords(text)
    text = num_to_word(text)
    text = remove_punctuation(text)
    text = lemmatize(text) if method == 'lemm' else stem(text)
    return text

## Convert data frame with raw data using listed functions above and display 5 first rows as an example

In [136]:
df['text_stem'] = df["main_html"].apply(lambda x: convert(x, 'stem'))
df['text_lemm'] = df["main_html"].apply(lambda x: convert(x, 'lemm'))
df.head(5)

Unnamed: 0,date,title,text_title,main_html,text_stem,text_lemm
0,2022-02-24,Russia-Ukraine Warning Update: Initial Russian...,Russia-Ukraine Warning Update: Initial Russian...,\n\n\nRussian President Vladimir Putin began ...,russian presid vladimir putin began larg scale...,russian president vladimir putin began large s...
1,2022-02-25,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,\n\nRussian forces entered major Ukrainian ci...,russian forc enter major ukrainian cities—incl...,russian force entered major ukrainian cities—i...
2,2022-02-26,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,\n\nRussian forces’ main axes of advance in t...,russian forc main axe advanc last twenti four ...,russian force main ax advance last twenty four...
3,2022-02-27,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,\n\n\n\nThe Russian military has likely recog...,russian militari like recogn initi expect limi...,russian military likely recognized initial exp...
4,2022-02-28,"Russian Offensive Campaign Assessment, Februar...","Russian Offensive Campaign Assessment, Februar...",\n\n\nThe Russian military is reorganizing it...,russian militari reorgan militari effort attem...,russian military reorganizing military effort ...


## Print 74 row as an example of lemmatization

In [137]:
docs = df["text_lemm"].tolist()
some_row = df.iloc[74]
some_row_in_html = some_row['text_lemm']
some_row_in_html

'russian force continue face widespread force generation challenge a senior u defense official stated u not observed indicator new major russian mobilization member private military company wagner group urgently requested hundred thousand additional troop reinforce russian effort donbas the official noted russia currently ninety seven battalion tactical group btgs ukraine btgs moving ukraine refit resupply suggesting russian troop continue sustain substantial damage combat isw previously assessed russian btgs heavily degraded counting btgs not useful metric russian combat power the main ukrainian intelligence directorate gur claimed under trained ill equipped russian conscript still sent active combat despite kremlin denying practice a prisoner war bar 7 detachment wagner group claimed covert mobilization underway russian send conscript clean damage caused combat self proclaimed donetsk luhansk people republic russian troop ukraine continue display low morale poor discipline fighting m

## Create instance of count vectorizer to vectorize text data

In [138]:
cv = CountVectorizer(max_df=0.98,min_df=2)
word_count_vector= cv.fit_transform(docs)

word_count_vector.shape

(333, 9343)

In [139]:
with open('data/3_isw_vectorized_data/cv.pkl', 'wb') as f:
    pickle.dump(cv, f)

In [140]:
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(word_count_vector)

In [141]:
with open('data/3_isw_vectorized_data/tfidf_transformer.pkl', 'wb') as f:
    pickle.dump(tfidf_transformer, f)

## Vectorization

In [142]:
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=cv.get_feature_names_out(), columns=["idf_weight"])
df_idf=df_idf.sort_values(by=["idf_weight"])


df_idf.head(15)

Unnamed: 0,idf_weight
staff,1.021181
area,1.024244
kharkiv,1.024244
attack,1.024244
position,1.027316
combat,1.030397
luhansk,1.030397
main,1.033489
medium,1.0397
around,1.0397


## Create final data frame which drops unnecessary columns from the previous one

In [148]:
output_file_parsed = 'preprocessed_isw_report.csv'

In [145]:
df_final = df.drop(columns={'title', 'text_title', 'main_html', 'text_stem'})
df_final.head(10)

Unnamed: 0,date,text_lemm
0,2022-02-24,russian president vladimir putin began large s...
1,2022-02-25,russian force entered major ukrainian cities—i...
2,2022-02-26,russian force main ax advance last twenty four...
3,2022-02-27,russian military likely recognized initial exp...
4,2022-02-28,russian military reorganizing military effort ...
5,2022-03-01,russian force completing reinforcement resuppl...
6,2022-03-02,russian force resumed offensive operation supp...
7,2022-03-03,critical threat project aei 2022this daily syn...
8,2022-03-04,critical threat project aei 2022this daily syn...
9,2022-03-05,early announcement two thousand and twenty two...


In [149]:
df_final.to_csv(f"data/2_preprocessed_isw_data/{output_file_parsed}", sep = ';', index = False)