In [11]:
import pandas as pd
import re

# Read DataFrame that needs to be tokenized
df = pd.read_csv('cleaned_no_non_latin_words_split_2_Ambra.csv')

def preprocess(text):
    # Make all text lowercase
    text = text.lower()
    # Remove emojis
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # Emoticons
        "\U0001F300-\U0001F5FF"  # Symbols & Pictographs
        "\U0001F680-\U0001F6FF"  # Transport & Map Symbols
        "\U0001F700-\U0001F77F"  # Alchemical Symbols
        "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
        "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
        "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        "\U0001FA00-\U0001FA6F"  # Chess Symbols
        "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        "\U00002702-\U000027B0"  # Dingbats
        "\U000024C2-\U0001F251"  # Enclosed Characters
        "]+",
        flags=re.UNICODE,
    )
    text = emoji_pattern.sub("", text)
    # Remove list indices like (a), (b), etc.
    text = re.sub(r'\(\w\)', '', text)
    # Remove single letters except for "I", "A", "U" (and lowercase variants)
    text = re.sub(r'\b(?!(I|A|U|i|a|u)\b)\w\b', '', text)
    # Remove lone letters surrounded by punctuation, except i, o, u, a (case-insensitive)
    text = re.sub(r'\b(?!(i|o|u|a|I|O|U|A)\b)[a-zA-Z]\b(?=\W|\s|$)', '', text)
    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Function to tokenize text
def tokenize(text):
    # Match acronyms, contractions, words, and punctuation as separate tokens
    return re.findall(r'\b(?:[A-Za-z]\.)+[A-Za-z]\b|(?:\w+\'\w+)|\w+|[^\w\s]', text)

# Apply preprocessing and tokenization
df['cleaned_text'] = df['cleaned_text'].apply(preprocess)  # Apply preprocess to 'cleaned_text' column
df['tokens'] = df['cleaned_text'].apply(tokenize)  # Apply tokenize to 'cleaned_text' column
df.drop(columns=['post'], inplace=True)

# Save the resulting DataFrame to a new CSV
df.to_csv('tokens_non_latin_words_split_2_Ambra.csv', index=False)

In [12]:
# EXPERIMENT 1 -CLEANED TEXT
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, precision_recall_fscore_support
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Load datasets
train_data = pd.read_csv('cleaned_train_split_2_Ambra.csv')
test_data = pd.read_csv('test_data.csv')

# Apply preprocessing to training data
train_data['cleaned_text'] = train_data['cleaned_text'].apply(preprocess)

X_train_texts = train_data['cleaned_text']
y_train_labels = train_data['nationality']
X_test_texts = test_data['post']

# Encode labels
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train_labels)
y_test = label_encoder.transform(test_data['nationality'])

# Vectorize text data
vectorizer = CountVectorizer(tokenizer=tokenize)
vectorizer_ngram = CountVectorizer(ngram_range=(1, 2), tokenizer=tokenize)

# Linear SVM (without n-grams)
X_train = vectorizer.fit_transform(X_train_texts)
X_test = vectorizer.transform(X_test_texts)

svm = SVC(kernel='linear', random_state=42)
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
print("Linear SVM (no n-grams) - Precision:", precision, "Recall:", recall, "F1 Score:", f1)

# Linear SVM with n-grams
X_train_ngram = vectorizer_ngram.fit_transform(X_train_texts)
X_test_ngram = vectorizer_ngram.transform(X_test_texts)

svm.fit(X_train_ngram, y_train)
y_pred_ngram = svm.predict(X_test_ngram)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred_ngram, average='weighted')
print("Linear SVM with n-grams - Precision:", precision, "Recall:", recall, "F1 Score:", f1)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Linear SVM (no n-grams) - Precision: 0.603572182673168 Recall: 0.5322772639868144 F1 Score: 0.5447215630353948
Linear SVM with n-grams - Precision: 0.6196241573259396 Recall: 0.5357567988279461 F1 Score: 0.5455106361764036


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:
# EXPERIMENT 2 -NO LATIN WORDS
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, precision_recall_fscore_support
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Load datasets
train_data = pd.read_csv('cleaned_no_non_latin_words_split_2_Ambra.csv')
test_data = pd.read_csv('test_data.csv')

# Apply preprocessing to training data
train_data['cleaned_text'] = train_data['cleaned_text'].apply(preprocess)

X_train_texts = train_data['cleaned_text']
y_train_labels = train_data['nationality']
X_test_texts = test_data['post']

# Encode labels
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train_labels)
y_test = label_encoder.transform(test_data['nationality'])

# Vectorize text data
vectorizer = CountVectorizer(tokenizer=tokenize)
vectorizer_ngram = CountVectorizer(ngram_range=(1, 2), tokenizer=tokenize)

# Linear SVM (without n-grams)
X_train = vectorizer.fit_transform(X_train_texts)
X_test = vectorizer.transform(X_test_texts)

svm = SVC(kernel='linear', random_state=42)
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
print("Linear SVM (no n-grams) - Precision:", precision, "Recall:", recall, "F1 Score:", f1)

# Linear SVM with n-grams
X_train_ngram = vectorizer_ngram.fit_transform(X_train_texts)
X_test_ngram = vectorizer_ngram.transform(X_test_texts)

svm.fit(X_train_ngram, y_train)
y_pred_ngram = svm.predict(X_test_ngram)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred_ngram, average='weighted')
print("Linear SVM with n-grams - Precision:", precision, "Recall:", recall, "F1 Score:", f1)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Linear SVM (no n-grams) - Precision: 0.5769436556804576 Recall: 0.5014192839483563 F1 Score: 0.5214500573956581
Linear SVM with n-grams - Precision: 0.5973065273258047 Recall: 0.5202820254555444 F1 Score: 0.527169321828692


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
# EXPERIMENT 3 - GRAMMAR CORRECTED
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, precision_recall_fscore_support
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Load datasets
train_data = pd.read_csv('cleaned_train_split_2_Ambra_with_grammar_correction.csv')
test_data = pd.read_csv('test_data.csv')

# Apply preprocessing to training data
train_data['grammar_corrected_text'] = train_data['grammar_corrected_text'].apply(preprocess)

X_train_texts = train_data['grammar_corrected_text']
y_train_labels = train_data['nationality']
X_test_texts = test_data['post']

# Encode labels
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train_labels)
y_test = label_encoder.transform(test_data['nationality'])

# Vectorize text data
vectorizer = CountVectorizer(tokenizer=tokenize)
vectorizer_ngram = CountVectorizer(ngram_range=(1, 2), tokenizer=tokenize)

# Linear SVM (without n-grams)
X_train = vectorizer.fit_transform(X_train_texts)
X_test = vectorizer.transform(X_test_texts)

svm = SVC(kernel='linear', random_state=42)
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
print("Linear SVM (no n-grams) - Precision:", precision, "Recall:", recall, "F1 Score:", f1)

# Linear SVM with n-grams
X_train_ngram = vectorizer_ngram.fit_transform(X_train_texts)
X_test_ngram = vectorizer_ngram.transform(X_test_texts)

svm.fit(X_train_ngram, y_train)
y_pred_ngram = svm.predict(X_test_ngram)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred_ngram, average='weighted')
print("Linear SVM with n-grams - Precision:", precision, "Recall:", recall, "F1 Score:", f1)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Linear SVM (no n-grams) - Precision: 0.5878477261180246 Recall: 0.5061807526783262 F1 Score: 0.5200153817110377
Linear SVM with n-grams - Precision: 0.6025949121748672 Recall: 0.505356652321216 F1 Score: 0.5175849503170623


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [15]:
# EXPERIMENT 4 - ONLY ENGLISH LANGUAGE
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, precision_recall_fscore_support
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Load datasets
train_data = pd.read_csv('cleaned_train_split_2_Ambra_exp4.csv')
test_data = pd.read_csv('test_data.csv')

# Apply preprocessing to training data
train_data['cleaned_text'] = train_data['cleaned_text'].apply(preprocess)

X_train_texts = train_data['cleaned_text']
y_train_labels = train_data['nationality']
X_test_texts = test_data['post']

# Encode labels
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train_labels)
y_test = label_encoder.transform(test_data['nationality'])

# Vectorize text data
vectorizer = CountVectorizer(tokenizer=tokenize)
vectorizer_ngram = CountVectorizer(ngram_range=(1, 2), tokenizer=tokenize)

# Linear SVM (without n-grams)
X_train = vectorizer.fit_transform(X_train_texts)
X_test = vectorizer.transform(X_test_texts)

svm = SVC(kernel='linear', random_state=42)
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
print("Linear SVM (no n-grams) - Precision:", precision, "Recall:", recall, "F1 Score:", f1)

# Linear SVM with n-grams
X_train_ngram = vectorizer_ngram.fit_transform(X_train_texts)
X_test_ngram = vectorizer_ngram.transform(X_test_texts)

svm.fit(X_train_ngram, y_train)
y_pred_ngram = svm.predict(X_test_ngram)
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred_ngram, average='weighted')
print("Linear SVM with n-grams - Precision:", precision, "Recall:", recall, "F1 Score:", f1)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Linear SVM (no n-grams) - Precision: 0.5992783479896269 Recall: 0.5273326618441535 F1 Score: 0.538973204363394
Linear SVM with n-grams - Precision: 0.6110525330616329 Recall: 0.5282483289076092 F1 Score: 0.5369931953459186


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
