In [6]:
import pandas as pd
import os

# Load Dataset and Select Columns

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
path = "/content/drive/My Drive/NLP_Project"
dataset_path = path+'/tweets.csv'

In [11]:
# Specify columns you want to use (0-indexed)
columns_to_use = [0, 1, 2, 3]
tweets_df = pd.read_csv(dataset_path, usecols=columns_to_use)

In [12]:
print(tweets_df.head())

                        date  \
0  2023-02-21 03:30:04+00:00   
1  2023-02-21 03:29:07+00:00   
2  2023-02-21 03:29:04+00:00   
3  2023-02-21 03:28:06+00:00   
4  2023-02-21 03:27:38+00:00   

                                             content  \
0  तुर्की में सोमवार देर रात भूंकप के तेज झटके मह...   
1                                    New search &amp   
2  Can't imagine those who still haven't recovere...   
3  its a highkey sign for all of us to ponder ove...   
4  Turkiye Earthquake: तुर्किए में फिर आया भूकंप ...   

                                            hashtags language  
0  ['ATDigital', 'Turkey', 'Earthquake', 'TurkeyE...       hi  
1   rescue work is in progress in #Hatay after tw...     True  
2   lost a lot but now having to face another dis...    False  
3    ['turkeyearthquake2023', 'earthquake', 'Syria']       en  
4  ['turkey', 'earthquake', 'turkiye', 'india', '...      und  


# Data Preprocessing

**Get Turkish tweets after the event for the first 24 hours**

In [13]:
# Get only Turkish results
tweets_df = tweets_df[tweets_df['language'] == 'tr']

# Convert 'date' column to datetime type if it is not already
tweets_df['date'] = pd.to_datetime(tweets_df['date'])

start_date = pd.to_datetime('2023-02-06 04:00:00').tz_localize('Etc/GMT-3')
end_date = pd.to_datetime('2023-02-07 04:00:00').tz_localize('Etc/GMT-3')
tweets_df = tweets_df[(tweets_df['date'] >= start_date) & (tweets_df['date'] <= end_date)]

tweets_df.dropna(inplace=True)

tweets_df['hashtags'] = tweets_df['hashtags'].str.lower()
tweets_df = tweets_df[tweets_df['hashtags'].apply(lambda tags: 'ahbap' in tags or 'afad' in tags)]

print(tweets_df)

                            date  \
300937 2023-02-07 00:59:49+00:00   
300950 2023-02-07 00:59:35+00:00   
300961 2023-02-07 00:59:26+00:00   
300972 2023-02-07 00:59:05+00:00   
301018 2023-02-07 00:58:02+00:00   
...                          ...   
476649 2023-02-06 02:11:56+00:00   
476675 2023-02-06 02:11:06+00:00   
476687 2023-02-06 02:10:37+00:00   
477299 2023-02-06 01:51:18+00:00   
477541 2023-02-06 01:42:41+00:00   

                                                  content  \
300937  ENKAZ ALTINDA Kahramanmaraş’ta oturan arkadaşı...   
300950  HATAYA YARDIM GİTMİYO GİTSE DE YETMİYOR HATAY ...   
300961  @nocontextElb KUMBET MAH, KUZEY SOKAK, SAHINLE...   
300972  ENKAZ ALTINDA Kahramanmaraş’ta oturan arkadaşı...   
301018  ACİL samandağına hala destek verilmediği insan...   
...                                                   ...   
476649      Çok Geçmiş Olsun 🙏🏻 #Afad #Deprem #earthquake   
476675  İçeride insanlar var dışarı çıkamıyorlar yardı...   
476687  Gaziantep 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tweets_df['hashtags'] = tweets_df['hashtags'].str.lower()


**Install Libraries**

In [14]:
!pip install pandas spacy nltk zemberek-python
!pip install git+https://github.com/emres/turkish-deasciifier.git
!pip install tqdm

Collecting git+https://github.com/emres/turkish-deasciifier.git
  Cloning https://github.com/emres/turkish-deasciifier.git to /tmp/pip-req-build-2fya30d9
  Running command git clone --filter=blob:none --quiet https://github.com/emres/turkish-deasciifier.git /tmp/pip-req-build-2fya30d9
  Resolved https://github.com/emres/turkish-deasciifier.git to commit 665154c734b09485c3d11ce0038cd121dd109594
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [15]:
import re
import spacy
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from zemberek import TurkishSentenceNormalizer, TurkishMorphology
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from turkish.deasciifier import Deasciifier
from tqdm import tqdm

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [16]:
# Initialize NLP tools
turkish_stopwords = stopwords.words('turkish')
morphology = TurkishMorphology.create_with_defaults()
normalizer = TurkishSentenceNormalizer(morphology)

INFO:zemberek.morphology.turkish_morphology:TurkishMorphology instance initialized in 11.236894607543945


2024-08-01 19:35:16,758 - zemberek.morphology.turkish_morphology - INFO
Msg: TurkishMorphology instance initialized in 11.236894607543945



In [17]:
# Function to preprocess a single tweet
def preprocess_tweet(tweet):
    # Remove consecutive repetitive letters (more than two)
    tweet = re.sub(r'(.)\1{2,}', r'\1\1', tweet)

    # Remove URLs
    tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet, flags=re.MULTILINE)

    # Remove punctuation marks
    tweet = re.sub(r'[^\w\s]', '', tweet)

    # Remove hashtags and mentions
    tweet = re.sub(r'[@#][^\s]+', '', tweet)

    # Remove additional white spaces
    tweet = re.sub(r'\s+', ' ', tweet)

    # Normalize sentence to avoid noisy text
    tweet = normalizer.normalize(tweet)

    # THIS PART NOT USED BECAUSE NORMALIZER ALREADY DOES THESE OPERATIONS

    # # Convert to lowercase
    # tweet = tweet.lower()

    # # Deascify
    # deasciifier = Deasciifier(tweet)
    # tweet = deasciifier.convert_to_turkish()

    # # Remove stop words
    # tokens = word_tokenize(tweet)
    # tweet = ' '.join(word for word in tokens if word not in turkish_stopwords)



    return tweet


In [None]:
# Apply preprocessing to each tweet in the DataFrame
tweets_df['content'] = tweets_df['content'].progress_apply(preprocess_tweet)

# Save the cleaned DataFrame to a CSV file
tweets_df.to_csv('cleaned_tweets.csv', index=False)

print("Data preprocessing completed and saved to 'cleaned_tweets.csv'")

# Tweet Classification

In [11]:
# Install necessary packages
!pip install transformers==4.30.2 pandas==1.5.3 torch==2.0.1 pyarrow==14.0.1 fsspec==2024.6.1 requests==2.31.0 datasets==2.12.0
!pip install evaluate

Collecting transformers==4.30.2
  Downloading transformers-4.30.2-py3-none-any.whl.metadata (113 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/113.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m112.6/113.6 kB[0m [31m4.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m113.6/113.6 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pandas==1.5.3
  Downloading pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting torch==2.0.1
  Downloading torch-2.0.1-cp310-cp310-manylinux1_x86_64.whl.metadata (24 kB)
Collecting pyarrow==14.0.1
  Downloading pyarrow-14.0.1-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.0 kB)
Collecting datasets==2.12.0
  Downloading datasets-2.12.0-py3-none-any.whl.metadata (20 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.30.2)
  Downloading token

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.2


In [2]:
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
import evaluate
from tqdm.auto import tqdm

In [3]:
# Function to load and preprocess the dataset
def load_and_preprocess_data(file_path):
    df = pd.read_csv(file_path)
    print("Original columns:", df.columns)
    df = df.drop(['Unnamed: 0'], axis=1)
    df = df.rename(columns={'Tweets': 'text', 'Class': 'label'})
    print("Renamed columns:", df.columns)
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
    return train_df, val_df

# Function to tokenize the dataset
def tokenize_dataset(dataset, tokenizer):
    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True)
    tokenized_dataset = dataset.map(tokenize_function, batched=True)
    return tokenized_dataset.remove_columns(["text"]).rename_column("label", "labels")

In [8]:
# Load and preprocess data
train_df, val_df = load_and_preprocess_data(path+'/labelled_data.csv')
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Initialize tokenizer and preprocess datasets
tokenizer = AutoTokenizer.from_pretrained("loodos/bert-base-turkish-uncased")
train_dataset = tokenize_dataset(train_dataset, tokenizer)
val_dataset = tokenize_dataset(val_dataset, tokenizer)

# Try to load the model
try:
    model = AutoModelForSequenceClassification.from_pretrained("loodos/bert-base-turkish-uncased", num_labels=2)
except Exception as e:
    print("Error loading model:", e)
    model = AutoModelForSequenceClassification.from_pretrained("dbmdz/bert-base-turkish-cased", num_labels=2)


Original columns: Index(['Unnamed: 0', 'Tweets', 'Class'], dtype='object')
Renamed columns: Index(['text', 'label'], dtype='object')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Some weights of the model checkpoint at loodos/bert-base-turkish-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification we

In [9]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=1,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    gradient_accumulation_steps=2
)

# Define compute metrics function
def compute_metrics(p):
    metric = evaluate.load("accuracy")
    return metric.compute(predictions=p.predictions.argmax(axis=-1), references=p.label_ids)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

# Save the model
model.save_pretrained("./model")
tokenizer.save_pretrained("./tokenizer")



Epoch,Training Loss,Validation Loss,Accuracy
1,0.0726,0.118024,0.97


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Evaluation results: {'eval_loss': 0.11802434176206589, 'eval_accuracy': 0.97, 'eval_runtime': 6.7851, 'eval_samples_per_second': 29.476, 'eval_steps_per_second': 7.369, 'epoch': 1.0}


('./tokenizer/tokenizer_config.json',
 './tokenizer/special_tokens_map.json',
 './tokenizer/vocab.txt',
 './tokenizer/added_tokens.json',
 './tokenizer/tokenizer.json')

In [10]:
# Load the tokenizer and model
model = AutoModelForSequenceClassification.from_pretrained("./model")
tokenizer = AutoTokenizer.from_pretrained("./tokenizer")


In [19]:
cleaned_dataset_path = path+'/cleaned_tweets.csv'
tweets_df = pd.read_csv(cleaned_dataset_path)

print(tweets_df)

                            date  \
0      2023-02-07 00:59:49+00:00   
1      2023-02-07 00:59:35+00:00   
2      2023-02-07 00:59:26+00:00   
3      2023-02-07 00:59:05+00:00   
4      2023-02-07 00:58:02+00:00   
...                          ...   
13058  2023-02-06 02:11:56+00:00   
13059  2023-02-06 02:11:06+00:00   
13060  2023-02-06 02:10:37+00:00   
13061  2023-02-06 01:51:18+00:00   
13062  2023-02-06 01:42:41+00:00   

                                                 content  \
0      enkaz altında kahramanmaraşta oturan arkadaşım...   
1      hataya yardım gitmiyor gitse de yetmiyor hatay...   
2      nocontextelb kümbet mah kuzey sokak şahinler a...   
3      enkaz altında kahramanmaraşta oturan arkadaşım...   
4      acil samandağına hala destek verilmediği insan...   
...                                                  ...   
13058            çok geçmiş olsun afad deprem earthquake   
13059  içeride insanlar var dışarı çıkamıyorlar yardı...   
13060  gaziantep deprem anı

# NER Model ()

In [36]:
ner_dataset_path = path+'/ner_tweets.csv'
tweets_df = pd.read_csv(ner_dataset_path)

print(tweets_df)

        source   content     tag
0            0     [CLS]  B-CITY
1            0     enkaz   OTHER
2            0   altında   OTHER
3            0  kahraman  B-CITY
4            0   ##maraş  I-CITY
...        ...       ...     ...
212237    3993      turk   OTHER
212238    3993      ##ey   OTHER
212239    3993       adı  B-CITY
212240    3993   ##yaman  I-CITY
212241    3993     [SEP]   OTHER

[212242 rows x 3 columns]
