In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords

Using Tweets column for theme classification

In [2]:
df = pd.read_csv('tweets.csv')

In [3]:
df.head()

Unnamed: 0,name,username,description,location,followers,numberstatuses,time,tweets
0,GunsandCoffee,GunsandCoffee70,ENGLISH TRANSLATIONS: http://t.co/QLdJ0ftews,,640,49,1/6/2015 21:07,ENGLISH TRANSLATION: 'A MESSAGE TO THE TRUTHFU...
1,GunsandCoffee,GunsandCoffee70,ENGLISH TRANSLATIONS: http://t.co/QLdJ0ftews,,640,49,1/6/2015 21:27,ENGLISH TRANSLATION: SHEIKH FATIH AL JAWLANI '...
2,GunsandCoffee,GunsandCoffee70,ENGLISH TRANSLATIONS: http://t.co/QLdJ0ftews,,640,49,1/6/2015 21:29,ENGLISH TRANSLATION: FIRST AUDIO MEETING WITH ...
3,GunsandCoffee,GunsandCoffee70,ENGLISH TRANSLATIONS: http://t.co/QLdJ0ftews,,640,49,1/6/2015 21:37,ENGLISH TRANSLATION: SHEIKH NASIR AL WUHAYSHI ...
4,GunsandCoffee,GunsandCoffee70,ENGLISH TRANSLATIONS: http://t.co/QLdJ0ftews,,640,49,1/6/2015 21:45,ENGLISH TRANSLATION: AQAP: 'RESPONSE TO SHEIKH...


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17410 entries, 0 to 17409
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   name            17410 non-null  object
 1   username        17410 non-null  object
 2   description     14728 non-null  object
 3   location        11432 non-null  object
 4   followers       17410 non-null  int64 
 5   numberstatuses  17410 non-null  int64 
 6   time            17410 non-null  object
 7   tweets          17410 non-null  object
dtypes: int64(2), object(6)
memory usage: 1.1+ MB


In [5]:
df.isnull().sum()

Unnamed: 0,0
name,0
username,0
description,2682
location,5978
followers,0
numberstatuses,0
time,0
tweets,0


As we can see, only description and location are null. But neither of these columns have any significance to us. So, we can just ignore them.

Since there are a lot of random values in this column, and a few are missing. I'm decideing to drop the column.

In [6]:
df.describe()

Unnamed: 0,followers,numberstatuses
count,17410.0,17410.0
mean,3975.07668,4760.787651
std,8674.378495,7015.967275
min,16.0,1.0
25%,266.0,207.0
50%,928.0,908.0
75%,1791.0,6865.0
max,34692.0,33091.0


Only followers and numberstatues columns are numerical. All the other columns are categorical

In [7]:
ace = df['location'].dropna().unique()
ace[:20]

array(['Islamic State', 'München, Deutschland', "Male'. Maldives.",
       'Dunya', 'Wilayat Hadramaut', 'EU', 'Among The Muslims', '.',
       'S.Wazirstan|Mahsud not a Wazir', 'England, United Kingdom',
       'yamin, yasār raqum ١٤', 'أسير الدنيا', 'Read my blog',
       'Antas, Bahia', 'world', 'Wilayah Twitter',
       'اُمتِ مُسلمہ ولایت کشمیر', 'United States',
       '28th Street, Qamar Precint', "Don't need to know"], dtype=object)

In [8]:
df = df.drop(columns=['location', 'description'])
df.head(2)

Unnamed: 0,name,username,followers,numberstatuses,time,tweets
0,GunsandCoffee,GunsandCoffee70,640,49,1/6/2015 21:07,ENGLISH TRANSLATION: 'A MESSAGE TO THE TRUTHFU...
1,GunsandCoffee,GunsandCoffee70,640,49,1/6/2015 21:27,ENGLISH TRANSLATION: SHEIKH FATIH AL JAWLANI '...


In [9]:
# Step 1: Convert the 'time' column to a standardized datetime format
df['time'] = pd.to_datetime(df['time'], errors='coerce')
# Step 2: Remove duplicate rows
df = df.drop_duplicates()

print(df.info())
print(df.head(2))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17410 entries, 0 to 17409
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   name            17410 non-null  object        
 1   username        17410 non-null  object        
 2   followers       17410 non-null  int64         
 3   numberstatuses  17410 non-null  int64         
 4   time            17410 non-null  datetime64[ns]
 5   tweets          17410 non-null  object        
dtypes: datetime64[ns](1), int64(2), object(3)
memory usage: 816.2+ KB
None
            name         username  followers  numberstatuses  \
0  GunsandCoffee  GunsandCoffee70        640              49   
1  GunsandCoffee  GunsandCoffee70        640              49   

                 time                                             tweets  
0 2015-01-06 21:07:00  ENGLISH TRANSLATION: 'A MESSAGE TO THE TRUTHFU...  
1 2015-01-06 21:27:00  ENGLISH TRANSLATION: SHEIKH FATIH 

In [10]:
# Ensure NLTK stopwords are available
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [11]:
# Define the stopwords list
stop_words = set(stopwords.words('english'))

In [12]:
# Function to clean the tweets
def clean_tweet(tweet):
    # Remove URLs
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet, flags=re.MULTILINE)
    # Remove mentions and hashtags
    tweet = re.sub(r"@\w+|#\w+", '', tweet)
    # Remove special characters and numbers
    tweet = re.sub(r"[^a-zA-Z\s]", '', tweet)
    # Normalize to lowercase
    tweet = tweet.lower()
    # Remove stopwords
    tweet = ' '.join(word for word in tweet.split() if word not in stop_words)
    return tweet

In [13]:
# Apply cleaning function to the 'tweets' column
df['tweets'] = df['tweets'].apply(clean_tweet)

# Display the first few rows of the cleaned tweets column
df[['tweets']].head()

Unnamed: 0,tweets
0,english translation message truthful syria she...
1,english translation sheikh fatih al jawlani pe...
2,english translation first audio meeting sheikh...
3,english translation sheikh nasir al wuhayshi h...
4,english translation aqap response sheikh baghd...


In [14]:
pip install empath

Collecting empath
  Downloading empath-0.89.tar.gz (57 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/57.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: empath
  Building wheel for empath (setup.py) ... [?25l[?25hdone
  Created wheel for empath: filename=empath-0.89-py3-none-any.whl size=57798 sha256=3a03be5e4e04b1d02318f105358b1237f6757b3408a13c1730d4cd973cac18eb
  Stored in directory: /root/.cache/pip/wheels/92/b3/83/9eb2c6199881e2385a59d99bd911363475060ebeb4bdb27242
Successfully built empath
Installing collected packages: empath
Successfully installed empath-0.89


In [15]:
# Import necessary libraries for lemmatization and tokenization
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from empath import Empath
import nltk

In [16]:
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [17]:
# Initialize lemmatizer and Empat
lemmatizer = WordNetLemmatizer()
lexicon = Empath()

In [18]:
# Function for lemmatization and tokenization
def lemmatize_and_tokenize(tweet):
    # Tokenize the tweet
    tokens = word_tokenize(tweet)
    # Lemmatize each token
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return lemmatized_tokens

# Apply lemmatization and tokenization to the 'tweets' column
df['tokens'] = df['tweets'].apply(lemmatize_and_tokenize)

In [19]:
# Function to categorize tweets using Empath
def categorize_tweet(tweet):
    empath_analysis = lexicon.analyze(tweet, categories=["propaganda", 'weapon', 'terrorism', 'crime', 'religion'], normalize=True)
    if empath_analysis:  # Check if the analysis is valid
        dominant_category = max(empath_analysis, key=empath_analysis.get)
        return dominant_category
    else:
        return "uncategorized"  # Default category for invalid or empty analysis


# Apply Empath categorization to the 'tweets' column
df['category'] = df['tweets'].apply(categorize_tweet)

In [20]:
categories = ["propaganda", "weapon", "terrorism", "crime", "religion"]

# Create a dictionary to store value counts for each category
category_counts = {category: df[df['category'] == category].shape[0] for category in categories}

# Display the category counts
category_counts

{'propaganda': 12208,
 'weapon': 2178,
 'terrorism': 856,
 'crime': 792,
 'religion': 419}

In [21]:
df.head(5)

Unnamed: 0,name,username,followers,numberstatuses,time,tweets,tokens,category
0,GunsandCoffee,GunsandCoffee70,640,49,2015-01-06 21:07:00,english translation message truthful syria she...,"[english, translation, message, truthful, syri...",propaganda
1,GunsandCoffee,GunsandCoffee70,640,49,2015-01-06 21:27:00,english translation sheikh fatih al jawlani pe...,"[english, translation, sheikh, fatih, al, jawl...",propaganda
2,GunsandCoffee,GunsandCoffee70,640,49,2015-01-06 21:29:00,english translation first audio meeting sheikh...,"[english, translation, first, audio, meeting, ...",propaganda
3,GunsandCoffee,GunsandCoffee70,640,49,2015-01-06 21:37:00,english translation sheikh nasir al wuhayshi h...,"[english, translation, sheikh, nasir, al, wuha...",propaganda
4,GunsandCoffee,GunsandCoffee70,640,49,2015-01-06 21:45:00,english translation aqap response sheikh baghd...,"[english, translation, aqap, response, sheikh,...",propaganda


In [22]:
# Save the updated data to a new CSV file
output_path = "/updated_tweets.csv"
df.to_csv(output_path, index=False)

print(f"Updated data saved to {output_path}")

Updated data saved to /updated_tweets.csv


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [None]:
# Ensure there are no missing values in the required columns
df = df.dropna(subset=['tweets', 'category'])

In [None]:
#Convert text to TF-IDF embeddings
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # Limit to 1000 features for simplicity
X = tfidf_vectorizer.fit_transform(df['tweets'])

In [None]:
# Encode the categories as numerical labels
y = df['category']

In [None]:
#Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Step 3: Train Logistic Regression model
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [None]:
# Step 4: Predict on the test set
y_pred = model.predict(X_test)

In [None]:
# Step 5: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

In [None]:
# Display detailed metrics
classification_rep = classification_report(y_test, y_pred, zero_division=0)

metrics = {
    "Accuracy": accuracy,
    "Precision": precision,
    "Recall": recall,
    "F1-Score": f1
}

print(f"Metrics: {metrics}")
print(f"Classification Report:\n{classification_rep}")

Metrics: {'Accuracy': 0.8701895462377943, 'Precision': 0.8296827701582048, 'Recall': 0.8701895462377943, 'F1-Score': 0.8374510765159967}
Classification Report:
               precision    recall  f1-score   support

        crime       0.94      0.53      0.67       165
   propaganda       0.85      1.00      0.92      2432
     religion       0.96      0.33      0.49        79
    terrorism       0.91      0.61      0.73       175
uncategorized       0.00      0.00      0.00       193
       weapon       0.98      0.86      0.92       438

     accuracy                           0.87      3482
    macro avg       0.77      0.56      0.62      3482
 weighted avg       0.83      0.87      0.84      3482

