In [102]:
import pandas as pd 
import numpy as np
import math
from typing import Dict, Tuple, Set, Optional, List, Union

import nltk

nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import ngrams

#for computing cosine similarity
from collections import Counter

#for text pre-processing
import re, string

from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

#for model-building
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score

# bag of words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

#for word embedding
import gensim
from gensim.models import Word2Vec

#plotting data
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [103]:
url = 'https://raw.githubusercontent.com/acikkaynak/depremadres-intent-classification-v0/main/data/8_9_Feb_deprem.csv'
data = pd.read_csv(url, index_col = 1)

url2 = 'https://raw.githubusercontent.com/acikkaynak/depremadres-intent-classification-v0/main/intent-classification-v0/intent_config/GIYSI.txt'
giysi_data = pd.read_csv(url2, header = None)

url3 = 'https://raw.githubusercontent.com/acikkaynak/depremadres-intent-classification-v0/main/intent-classification-v0/intent_config/KURTARMA.txt'
kurtarma_data = pd.read_csv(url3, header = None)

url4 = 'https://raw.githubusercontent.com/acikkaynak/depremadres-intent-classification-v0/main/intent-classification-v0/intent_config/YEMEK-SU.txt'
yemek_data = pd.read_csv(url4, header = None)

In [104]:
#store all the txt. files in lists
giysi_list = giysi_data.iloc[:,0].tolist()
kurtarma_list = kurtarma_data.iloc[:,0].tolist()
yemek_list = yemek_data.iloc[:,0].tolist()

In [105]:
# store full_text column in a list
tweet_list = data['full_text'].tolist()

In [106]:
# preprocess the text to delete URLs and tagged account from full_text column
def preprocess(text_list):
    processed_list = []
    for text in text_list:
        text = re.sub(r"http\S+", "", text)
        text = re.sub(r"@\S+", "", text)
        #text = re.sub(r"#\S+", "", text)
        #text = re.sub(r"\n", " ", text)

        processed_list.append(text)
    return processed_list

In [107]:
#create filtered_word_list

def text_cleaned(text_list):
    filtered_words = []
    stop_words = set(stopwords.words('turkish'))
    for i in range(len(text_list)):
        clean_text_list = []
        clean_text_list += text_list[i].split()
        filter = [w for w in clean_text_list if not w in stop_words]
        filtered_words.append(filter)
    
    return filtered_words

In [108]:
#apply the functions of preprocess and text_cleaned
tweet_list_preprocessed = preprocess(tweet_list)
tweet_list_cleaned = text_cleaned(tweet_list_preprocessed)

In [109]:
#set a function for computation similarity between two lists
def counter_cosine_similarity(c1, c2):
    terms = set(c1).union(c2)
    dotprod = sum(c1.get(k, 0) * c2.get(k, 0) for k in terms)
    magA = math.sqrt(sum(c1.get(k, 0)**2 for k in terms))
    magB = math.sqrt(sum(c2.get(k, 0)**2 for k in terms))

    #avoid float division by zero
    if magA == 0 or magB==0:
      return 0  
    else:
      return dotprod / (magA * magB)

In [110]:
#creating counter lists
counter_giysi = Counter(giysi_list)
counter_kurtarma = Counter(kurtarma_list)
counter_yemek = Counter(yemek_list)

#computing cosine similarities for all the tweets
similarity_giysi = []
similarity_kurtarma = []
similarity_yemek = []

for index in range(len(tweet_list_cleaned)):
  counter_tweet = Counter(tweet_list_cleaned[index])
  similarity_giysi.append(counter_cosine_similarity(counter_tweet, counter_giysi))
  similarity_kurtarma.append(counter_cosine_similarity(counter_tweet, counter_kurtarma))
  similarity_yemek.append(counter_cosine_similarity(counter_tweet, counter_yemek))

In [111]:
#creating data frame
tweets = []
for index in range(len(tweet_list_cleaned)):
  tweets.append(''.join(tweet_list_cleaned[index]))

similarity_data = {'tweets': tweets,
        'Giysi' : similarity_giysi,
        'Kurtarma': similarity_kurtarma,
        'Yemek': similarity_yemek}

df = pd.DataFrame(similarity_data)
df.head(10)

Unnamed: 0,tweets,Giysi,Kurtarma,Yemek
0,#depremadresZiyaGökalpCd.No:115,0.0,0.0,0.0
1,#depremadres11.Sk.No:50,0.0,0.0,0.0
2,#depremadres302.Sk.,0.0,0.0,0.0
3,#depremadres11.Sk.No:50,0.0,0.0,0.0
4,#depremadresSevimŞİrikçiMeslekiTeknikAnadoluLi...,0.0,0.0,0.0
5,#depremadresCumhuriyetCd.48C,0.0,0.0,0.0
6,#depremadres,0.0,0.0,0.0
7,#depremadresçekmecemahallesiuğurmumcucaddesiye...,0.0,0.073771,0.0
8,#depremadresUmutkent,0.0,0.0,0.0
9,#depremadres,0.0,0.0,0.0
