In [1]:
import pandas as pd
import numpy as np

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

train_data = pd.read_csv("/kaggle/input/multi-lingual-sentiment-analysis/train.csv")

train_sample_data = train_data.iloc[:900].copy()  
test_sample_data = train_data.iloc[900:].copy()  

label_encoder = LabelEncoder()
train_sample_data.loc[:, 'label'] = label_encoder.fit_transform(train_sample_data['label'])
test_sample_data.loc[:, 'label'] = label_encoder.transform(test_sample_data['label'])

X_train = train_sample_data['sentence']
y_train = train_sample_data['label']
X_test = test_sample_data['sentence']
y_test = test_sample_data['label']

y_train = y_train.astype('int64')
y_test = y_test.astype('int64')

vectorizer = TfidfVectorizer(max_features=1000)  
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

xgb_model = xgb.XGBClassifier(eval_metric='mlogloss', use_label_encoder=False)
xgb_model.fit(X_train_tfidf, y_train)

y_pred = xgb_model.predict(X_test_tfidf)

y_pred = y_pred.astype('int64')

# print(f"Unique values in y_test: {set(y_test)}")
# print(f"Unique values in y_pred: {set(y_pred)}")

f1 = f1_score(y_test, y_pred)
print(f"F1 Score: {f1:.4f}")

F1 Score: 0.6337


In [3]:
from sklearn.model_selection import RandomizedSearchCV

train_sample_data = train_data.iloc[:900].copy()  
test_sample_data = train_data.iloc[900:].copy()  

label_encoder = LabelEncoder()
train_sample_data.loc[:, 'label'] = label_encoder.fit_transform(train_sample_data['label'])
test_sample_data.loc[:, 'label'] = label_encoder.transform(test_sample_data['label'])

X_train = train_sample_data['sentence']
y_train = train_sample_data['label']
X_test = test_sample_data['sentence']
y_test = test_sample_data['label']

y_train = y_train.astype('int64')
y_test = y_test.astype('int64')

vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

xgb_model = xgb.XGBClassifier(eval_metric='mlogloss', use_label_encoder=False)

param_grid = {
    'max_depth': [3, 4, 5, 6, 7],  
    'learning_rate': [0.001, 0.01, 0.1, 0.2], 
    'n_estimators': [100, 200, 300, 500], 
    'subsample': [0.7, 0.8, 0.9, 1], 
    'colsample_bytree': [0.7, 0.8, 0.9, 1],  
    'gamma': [0, 0.1, 0.2, 0.3], 
}

random_search = RandomizedSearchCV(
    xgb_model, param_grid, n_iter=10, cv=3, verbose=1, random_state=42, n_jobs=-1
)

random_search.fit(X_train_tfidf, y_train)

print("Best parameters found: ", random_search.best_params_)

best_xgb_model = random_search.best_estimator_

y_pred = best_xgb_model.predict(X_test_tfidf)
y_pred = y_pred.astype('int64')

f1 = f1_score(y_test, y_pred)
print(f"F1 Score with Best Model: {f1:.4f}")

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best parameters found:  {'subsample': 0.7, 'n_estimators': 500, 'max_depth': 6, 'learning_rate': 0.1, 'gamma': 0.2, 'colsample_bytree': 0.7}
F1 Score with Best Model: 0.6286


In [4]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RandomizedSearchCV

nlp = spacy.load('en_core_web_sm')

def preprocess_text(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])

train_data = pd.read_csv("/kaggle/input/multi-lingual-sentiment-analysis/train.csv")

train_sample_data = train_data.iloc[:900].copy()
test_sample_data = train_data.iloc[900:].copy()

label_encoder = LabelEncoder()
train_sample_data.loc[:, 'label'] = label_encoder.fit_transform(train_sample_data['label'])
test_sample_data.loc[:, 'label'] = label_encoder.transform(test_sample_data['label'])

train_sample_data['sentence'] = train_sample_data['sentence'].apply(preprocess_text)
test_sample_data['sentence'] = test_sample_data['sentence'].apply(preprocess_text)

X_train = train_sample_data['sentence']
y_train = train_sample_data['label']
X_test = test_sample_data['sentence']
y_test = test_sample_data['label']

y_train = y_train.astype('int64')
y_test = y_test.astype('int64')

vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=1000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

xgb_model = xgb.XGBClassifier(eval_metric='mlogloss', use_label_encoder=False)

param_grid = {
    'max_depth': [3, 4, 5, 6, 7],
    'learning_rate': [0.001, 0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300, 500],
    'subsample': [0.7, 0.8, 0.9, 1],
    'colsample_bytree': [0.7, 0.8, 0.9, 1],
    'gamma': [0, 0.1, 0.2, 0.3],
}

random_search = RandomizedSearchCV(xgb_model, param_grid, n_iter=10, cv=3, verbose=1, random_state=42, n_jobs=-1)
random_search.fit(X_train_tfidf, y_train)

best_xgb_model = random_search.best_estimator_

y_pred = best_xgb_model.predict(X_test_tfidf)
f1 = f1_score(y_test, y_pred)
print(f"F1 Score with Improved Preprocessing and Hyperparameter Search: {f1:.4f}")


Fitting 3 folds for each of 10 candidates, totalling 30 fits
F1 Score with Improved Preprocessing and Hyperparameter Search: 0.6038


In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

train_data = pd.read_csv("/kaggle/input/multi-lingual-sentiment-analysis/train.csv")

train_sample_data = train_data.iloc[:900].copy()
test_sample_data = train_data.iloc[900:].copy()

label_encoder = LabelEncoder()
train_sample_data.loc[:, 'label'] = label_encoder.fit_transform(train_sample_data['label'])
test_sample_data.loc[:, 'label'] = label_encoder.transform(test_sample_data['label'])

X_train = train_sample_data['sentence']
y_train = train_sample_data['label']
X_test = test_sample_data['sentence']
y_test = test_sample_data['label']

y_train = y_train.astype('int64')
y_test = y_test.astype('int64')

vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=1000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

log_reg_model = LogisticRegression(max_iter=1000, random_state=42)

log_reg_model.fit(X_train_tfidf, y_train)
y_pred = log_reg_model.predict(X_test_tfidf)

f1 = f1_score(y_test, y_pred)
print(f"F1 Score with Logistic Regression: {f1:.4f}")


F1 Score with Logistic Regression: 0.7143


In [6]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

train_data = pd.read_csv("/kaggle/input/multi-lingual-sentiment-analysis/train.csv")

train_sample_data = train_data.iloc[:900].copy()
test_sample_data = train_data.iloc[900:].copy()

label_encoder = LabelEncoder()
train_sample_data.loc[:, 'label'] = label_encoder.fit_transform(train_sample_data['label'])
test_sample_data.loc[:, 'label'] = label_encoder.transform(test_sample_data['label'])

X_train = train_sample_data['sentence']
y_train = train_sample_data['label']
X_test = test_sample_data['sentence']
y_test = test_sample_data['label']

y_train = y_train.astype('int64')
y_test = y_test.astype('int64')

vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=1000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

rf_model.fit(X_train_tfidf, y_train)

y_pred = rf_model.predict(X_test_tfidf)

f1 = f1_score(y_test, y_pred, average='macro')
print(f"F1 Score with Random Forest: {f1:.4f}")


F1 Score with Random Forest: 0.6198


In [7]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

train_data = pd.read_csv("/kaggle/input/multi-lingual-sentiment-analysis/train.csv")
train_sample_data = train_data.iloc[:900].copy()
test_sample_data = train_data.iloc[900:].copy()
label_encoder = LabelEncoder()
train_sample_data.loc[:, 'label'] = label_encoder.fit_transform(train_sample_data['label'])
test_sample_data.loc[:, 'label'] = label_encoder.transform(test_sample_data['label'])

X_train = train_sample_data['sentence']
y_train = train_sample_data['label']
X_test = test_sample_data['sentence']
y_test = test_sample_data['label']

y_train = y_train.astype('int64')
y_test = y_test.astype('int64')

vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=1000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

svm_model = SVC(kernel='linear', random_state=42)

svm_model.fit(X_train_tfidf, y_train)
y_pred = svm_model.predict(X_test_tfidf)
f1 = f1_score(y_test, y_pred, average='macro')
print(f"F1 Score with SVM: {f1:.4f}")


F1 Score with SVM: 0.6144


In [8]:
train_data = pd.read_csv("/kaggle/input/multi-lingual-sentiment-analysis/train.csv")
test_data = pd.read_csv("/kaggle/input/multi-lingual-sentiment-analysis/test.csv")

label_encoder = LabelEncoder()
train_data['label'] = label_encoder.fit_transform(train_data['label'])

X_train = train_data['sentence']
y_train = train_data['label']

vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = vectorizer.fit_transform(X_train)

xgb_model = xgb.XGBClassifier(eval_metric='mlogloss', use_label_encoder=False)
xgb_model.fit(X_train_tfidf, y_train)

X_test = test_data['sentence']
X_test_tfidf = vectorizer.transform(X_test)

y_pred = xgb_model.predict(X_test_tfidf)

y_pred_labels = label_encoder.inverse_transform(y_pred)

submission = pd.DataFrame({
    'ID': test_data['ID'],  
    'label': y_pred_labels  
})

# submission.to_csv('/kaggle/working/submission.csv', index=False)

print("Submission file saved.")

Submission file saved.


In [9]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

train_data = pd.read_csv("/kaggle/input/multi-lingual-sentiment-analysis/train.csv")
test_data = pd.read_csv("/kaggle/input/multi-lingual-sentiment-analysis/test.csv")

label_encoder = LabelEncoder()
train_data['label'] = label_encoder.fit_transform(train_data['label'])

X_train = train_data['sentence']
y_train = train_data['label']

vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = vectorizer.fit_transform(X_train)

log_reg_model = LogisticRegression(max_iter=1000, random_state=42)
log_reg_model.fit(X_train_tfidf, y_train)

X_test = test_data['sentence']
X_test_tfidf = vectorizer.transform(X_test)

y_pred = log_reg_model.predict(X_test_tfidf)
y_pred_labels = label_encoder.inverse_transform(y_pred)

submission = pd.DataFrame({
    'ID': test_data['ID'],  
    'label': y_pred_labels  
})

submission.to_csv('/kaggle/working/submission.csv', index=False)

print("Submission file saved.")


Submission file saved.
