In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('kaggle/'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

kaggle/sample_submission.csv
kaggle/test.csv
kaggle/train.csv


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Load the data
train_df = pd.read_csv("kaggle/train.csv")
test_df = pd.read_csv("kaggle/test.csv")

# Separate features and target variable
X_train = train_df["text"]
y_train = train_df["target"]
X_test = test_df["text"]

# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Fit and transform on training data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
# Transform test data
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Initialize logistic regression classifier
classifier = LogisticRegression()

# Split the training data into training and validation sets
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train_tfidf, y_train, test_size=0.2, random_state=42)

# Train the classifier
classifier.fit(X_train_split, y_train_split)
# Predict probabilities on the validation set
val_probabilities = classifier.predict_proba(X_val_split)[:, 1]

# Convert probabilities to binary predictions using a threshold
threshold = 0.5
val_predictions = (val_probabilities > threshold).astype(int)

# Calculate F1 score on the validation set
f1 = f1_score(y_val_split, val_predictions)
print("Validation F1 Score:", f1)

# Make predictions on the test set
test_probabilities = classifier.predict_proba(X_test_tfidf)[:, 1]
test_predictions = (test_probabilities > threshold).astype(int)

Validation F1 Score: 0.7541528239202658


In [3]:
# Save TF-IDF vectorizer and trained model
import joblib
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')
joblib.dump(classifier, 'classifier.pkl')

['classifier.pkl']

In [4]:
# load the model and vectorizer
tfidf_vectorizer = joblib.load('tfidf_vectorizer.pkl')
classifier = joblib.load('classifier.pkl')


In [10]:
# make the predictions using the classifier with user data
user_data = [
    "Just saw a huge fire downtown! #disaster",
    "There's been a major earthquake in California.",
    "Breaking News: Hurricane Florence is heading towards the East Coast.",
    "A tsunami warning has been issued for the Pacific coast.",
    "The building collapsed in the city center. It's a disaster!",
    "There's a wildfire spreading rapidly near the residential area.",
    "New shoes got me shook, feels like an earthquake!",
    "I'm so hungry, it's like a tornado hit my kitchen!",
    "My grades are a disaster right now.",
    "This traffic jam is a total disaster.",
]



user_data_tfidf = tfidf_vectorizer.transform(user_data)
user_predictions = classifier.predict(user_data_tfidf)
print(user_predictions)


[1 1 1 1 1 1 0 0 0 1]
