In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# Getting the Data From Eksisozluk

I used https://github.com/coluck/eksisozluk-api for scraping the data from
Eksisozluk. It's based on node.js, and it returns data in a json format.

In [2]:
def entries_to_text(entry_dataframe: pd.DataFrame) -> str:
    """A function to merge all of the entries in a page given by the Eksisozluk
    API. Also handles the removal of HTML tags found in the raw results from
    the API."""
    final_text = ''
    for entry in entry_dataframe['body']:
        soup = BeautifulSoup(entry)
        final_text = ' '.join([final_text, soup.get_text()])
    return final_text

In [3]:
def scan_pages_to_str(start_page: int, end_page: int, header_link: str) -> str:
    """A function that uses Eksisozluk API to process all of the entries in the 
    given page range to a string."""
    text_out = ''
    for i in range(start_page, end_page):
        req = requests.get(f"{header_link}?p={i}")
        entries = pd.DataFrame.from_dict(req.json()["entries"])
        text_out = ' '.join([text_out, entries_to_text(entries)])
    return text_out       

In [4]:
def scan_pages_to_df(start_page: int, end_page: int, header_link: str) -> pd.DataFrame:
    """A function that uses Eksisozluk API to process all of the entries in the 
    given page range to a Pandas Data Frame. Also handles the removal of HTML
    tags found in the raw results from the API."""
    df_out = pd.DataFrame()
    soup = BeautifulSoup()
    for i in range(start_page, end_page):
        req = requests.get(f"{header_link}?p={i}")
        entries = pd.DataFrame(req.json()["entries"])
        df_out = pd.concat([entries, df_out])
    df_out = df_out.reset_index(drop=True)
    df_out["cleaned_body"] = [BeautifulSoup(entry).get_text() for entry in df_out["body"]]
    return df_out

Getting some entries about the main opposition candidate:

In [5]:
#Page 4247 corresponds to entries written on May 1
# Page 4404 ends the night before the election day
kilicdar_text = scan_pages_to_df(4247, 4404, 'http://localhost:3000/api/baslik/kemal-kilicdaroglu--1267550')

  df_out["cleaned_body"] = [BeautifulSoup(entry).get_text() for entry in df_out["body"]]


# Fine-Tuned BERTurk Based Model

In [6]:
sent_model = AutoModelForSequenceClassification.from_pretrained("savasy/bert-base-turkish-sentiment-cased")
sent_tokenizer = AutoTokenizer.from_pretrained("savasy/bert-base-turkish-sentiment-cased")
sentiment_pipeline = pipeline("sentiment-analysis", tokenizer=sent_tokenizer, model=sent_model)

Filtering out entries that produce more than 512 tokens.

In [7]:
sent_tokenizer(kilicdar_text["cleaned_body"][0])

{'input_ids': [2, 20562, 7362, 2623, 5124, 7065, 16, 2345, 2965, 27870, 1031, 3236, 1992, 18392, 16516, 2637, 2811, 31461, 5179, 6664, 5693, 5621, 18, 6, 6271, 16, 2684, 18756, 2107, 23210, 1987, 3596, 2061, 2287, 14845, 5558, 2023, 29907, 16847, 2984, 1022, 4602, 5244, 2016, 18, 2536, 2051, 24403, 6682, 1029, 1976, 2337, 2042, 22409, 8175, 1975, 2678, 1992, 6975, 6975, 22552, 16531, 2011, 4602, 5244, 2110, 2339, 19950, 2020, 29985, 5224, 2085, 7600, 2061, 18, 57, 10855, 22780, 3117, 3439, 2631, 26746, 22565, 16, 4783, 4394, 1013, 16, 9456, 16282, 4258, 2684, 2866, 4670, 9929, 5304, 5027, 18, 6, 8731, 25895, 6629, 2637, 70, 4420, 18328, 2468, 10463, 18, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [8]:
kilicdar_clean = kilicdar_text.copy()

In [9]:
for i in range(len(kilicdar_clean["cleaned_body"])):
    if len(sent_tokenizer(kilicdar_clean["cleaned_body"][i])["input_ids"]) > 512:
        kilicdar_clean.drop(i, inplace=True)

Token indices sequence length is longer than the specified maximum sequence length for this model (3088 > 512). Running this sequence through the model will result in indexing errors


In [10]:
sentiment_pipeline(list(kilicdar_clean["cleaned_body"])[:2])

[{'label': 'positive', 'score': 0.7617455124855042},
 {'label': 'positive', 'score': 0.9610978364944458}]

In [11]:
kilicdar_clean_sentiments_small = kilicdar_clean.iloc[:5].copy()
kilicdar_clean_sentiments_small["sentiment_data"] = [sentiment_pipeline(list(kilicdar_clean_sentiments_small["cleaned_body"])[i])[0] for i in range(len(kilicdar_clean_sentiments_small))]
kilicdar_clean_sentiments_small

Unnamed: 0,id,body,author,author_id,fav_count,created_at,updated_at,cleaned_body,sentiment_data
0,152322598,"açıkçası sadece tatlı dili, uzlaşmacı yanı ve ...",okhako,2245868,2,13.05.2023 22:55,13.05.2023 22:56,"açıkçası sadece tatlı dili, uzlaşmacı yanı ve ...","{'label': 'positive', 'score': 0.7617455124855..."
1,152322891,bu akşam anadolunun şirin bir ilçesinde<sup cl...,karate kamil,566889,0,13.05.2023 23:00,14.05.2023 00:18,bu akşam anadolunun şirin bir ilçesinde* hamam...,"{'label': 'positive', 'score': 0.9610978364944..."
2,152323083,bu akşam kurmaylarına şu sözü söylediğine emin...,konsensus1,2057317,0,13.05.2023 23:03,13.05.2023 23:10,bu akşam kurmaylarına şu sözü söylediğine emin...,"{'label': 'positive', 'score': 0.5693134069442..."
3,152323096,yarın 13. cumhurbaşkanı olacak,ekcord,2189546,2,13.05.2023 23:03,,yarın 13. cumhurbaşkanı olacak,"{'label': 'positive', 'score': 0.9607189893722..."
4,152323206,seçimi kazanması halinde mafyatik rakipleri il...,gaditano,1460908,0,13.05.2023 23:05,,seçimi kazanması halinde mafyatik rakipleri il...,"{'label': 'negative', 'score': 0.6322678327560..."


In [12]:
kilicdar_clean_sentiments_small["sentiment_data"][0]

{'label': 'positive', 'score': 0.7617455124855042}

In [13]:
kilicdar_clean_sentiments = kilicdar_clean.copy()
kilicdar_clean_sentiments["sentiment_data"] = [sentiment_pipeline(list(kilicdar_clean_sentiments["cleaned_body"])[i])[0] for i in range(len(kilicdar_clean_sentiments))]

In [14]:
kilicdar_clean_sentiments = kilicdar_clean_sentiments.reset_index(drop=True)

In [16]:
def overall_sentiment(df: pd.DataFrame) -> int:
    sentiment = 0
    for i in range(len(df)):
        sentiment += df["sentiment_data"][i]["score"] if df["sentiment_data"][i]["label"] == "positive" else -df["sentiment_data"][i]["score"]
    return sentiment

In [17]:
overall_sentiment(kilicdar_clean_sentiments)

-52.535507559776306