In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import roc_auc_score

# End-to-end script: fit an XGBoost classifier for the 'viral' label on
# TF-IDF text plus tabular features, tune it with a randomized search, report
# the hold-out ROC AUC, and write test-set probabilities to a submission CSV.

# Columns fed to the model. Defined once and reused for both the training and
# the test frame so the two selections cannot drift apart.
FEATURE_COLUMNS = [
    'clean_text_stopwordsremoved',
    'month',
    'day_of_week',
    'hour_of_day',
    'display_text_range',
    'is_quote_status',
    'includes_media',
    'number_hashtags',
    'sentiment_fulltext',
    'sentiment_cleantext',
]

# Column groups routed to the three preprocessing branches below.
# `text_features` is deliberately a plain string (not a list): the
# ColumnTransformer then hands the vectorizer a 1-D column of documents.
text_features = 'clean_text_stopwordsremoved'
categorical_features = ['month', 'day_of_week', 'hour_of_day', 'is_quote_status', 'includes_media']
numerical_features = ['display_text_range', 'number_hashtags', 'sentiment_fulltext', 'sentiment_cleantext']

# Load the training data
train_data = pd.read_csv('/kaggle/input/hackathon/beginner_training.csv')

# Preprocess the text data: missing text becomes an empty document.
train_data[text_features] = train_data[text_features].fillna('')
# Binarize the target: exactly 1 stays 1, everything else (including missing
# values) becomes 0.
train_data['viral'] = (train_data['viral'] == 1).astype(int)

# Separate input features and target variable
X = train_data[FEATURE_COLUMNS]
y = train_data['viral']

# Split the data into training and validation sets. The split is stratified
# so the hold-out set keeps the same class ratio as the training portion,
# matching the stratified folds the cross-validated search uses.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Preprocess and vectorize text and categorical features
text_transformer = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english'))
])

# handle_unknown='ignore' encodes a category that was not present at fit time
# as an all-zero row. With the default ('error') a value that is missing from
# one CV fold or from the training file but present in the validation/test
# data raises a ValueError at transform time.
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numerical_transformer = Pipeline([
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('text', text_transformer, text_features),
        ('cat', categorical_transformer, categorical_features),
        ('num', numerical_transformer, numerical_features),
    ])

# Full model: preprocessing followed by an XGBoost classifier.
# NOTE(review): `use_label_encoder` is deprecated in recent XGBoost releases;
# confirm whether the installed version still needs it.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(use_label_encoder=False, eval_metric='logloss'))
])

# Define the hyperparameter search space (2 values per parameter -> 16
# combinations, of which the search samples 10).
param_dist = {
    'classifier__n_estimators': [100, 200],
    'classifier__learning_rate': [0.01, 0.1],
    'classifier__max_depth': [3, 5],
    'classifier__min_child_weight': [1, 5],
}

# Perform a RandomizedSearch for the best hyperparameters, scored by ROC AUC
# with 5-fold cross-validation on the training portion only.
random_search = RandomizedSearchCV(
    model,
    param_dist,
    scoring='roc_auc',
    cv=5,
    n_iter=10,
    random_state=42,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)
print("Best parameters found: ", random_search.best_params_)
print("Best score found: ", random_search.best_score_)

# The search refits the best configuration on all of X_train by default, so
# best_estimator_ is ready to use without another fit call.
best_model = random_search.best_estimator_

# Make predictions on the validation set (probability of the positive class)
y_val_pred_proba = best_model.predict_proba(X_val)[:, 1]

# Evaluate the performance of the model on the validation set
val_auc = roc_auc_score(y_val, y_val_pred_proba)
print(f'Validation AUC: {val_auc}')

# Load the test data
test_data = pd.read_csv('/kaggle/input/hackathon/beginner_testing.csv')

# Apply the same text clean-up as for the training data
test_data[text_features] = test_data[text_features].fillna('')

# Select the same feature columns used for training
X_test = test_data[FEATURE_COLUMNS]

# Make predictions on the test data
y_test_pred_proba = best_model.predict_proba(X_test)[:, 1]

# Create a submission file with viral value between 0 and 1
submission_df = pd.DataFrame({'tweet_id': test_data['tweet_id'], 'viral': y_test_pred_proba})
submission_df.to_csv('submission20.csv', index=False)
