In [2]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# 1. Load the datasets
# Both CSV files are read from the current working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

# 2. Preprocess the data
# Replace missing comments with a single space so that every row of
# 'comment_text' holds a string before the text is processed further.
for frame in (train, test):
    frame['comment_text'] = frame['comment_text'].fillna(' ')

# 3. Vectorize the text
# Convert the text comments into numerical vectors using TF-IDF
print("Vectorizing text data...")
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,        # scale term frequency as 1 + log(tf)
    strip_accents='unicode',  # normalize accented characters during preprocessing
    analyzer='word',          # features are built from word tokens
    stop_words='english',     # drop scikit-learn's built-in English stop words
    max_features=20000        # keep only the 20k most frequent terms in the corpus
)

# Learn the vocabulary and IDF weights from the training data only and transform
# it in the same pass; fit_transform avoids tokenizing the training corpus twice
# (once in fit, once in transform) and yields the same matrix.
train_features = word_vectorizer.fit_transform(train['comment_text'])

# The test set is only transformed, using the vocabulary learned from training data.
test_features = word_vectorizer.transform(test['comment_text'])

# 4. Train the Logistic Regression Model
# Since there are multiple target labels, we train a separate binary model for
# each label and collect the predicted probabilities in `submission`.
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
submission = pd.DataFrame.from_dict({'id': test['id']})

# The 'sag' solver shuffles the data with a random number generator; fixing the
# seed makes the fitted coefficients (and thus the predictions) reproducible
# from one run to the next.
SEED = 42

print("Training models...")
for class_name in class_names:
    # Select the target column for the current class
    train_target = train[class_name]

    # Initialize the Logistic Regression classifier
    # solver='sag' is often faster for large datasets
    classifier = LogisticRegression(solver='sag', max_iter=1000, random_state=SEED)

    # Fit the model
    classifier.fit(train_features, train_target)

    # Predict probabilities for the test set. Column 1 is taken as the
    # probability of the positive class (assumes the label columns are
    # binary 0/1 -- TODO confirm against the dataset).
    submission[class_name] = classifier.predict_proba(test_features)[:, 1]
    print(f" - Model for '{class_name}' trained.")

# 5. Output the results
# Show only the first rows of the prediction table as a quick sanity check.
preview = submission.head()
print("\nFirst few predictions:", preview, sep="\n")

# Optional: uncomment to write the predictions to disk.
# submission.to_csv('submission.csv', index=False)

Vectorizing text data...
Training models...
 - Model for 'toxic' trained.
 - Model for 'severe_toxic' trained.
 - Model for 'obscene' trained.
 - Model for 'threat' trained.
 - Model for 'insult' trained.
 - Model for 'identity_hate' trained.

First few predictions:
                 id     toxic  severe_toxic   obscene    threat    insult  \
0  00001cee341fdb12  0.995882      0.158408  0.992690  0.025056  0.873226   
1  0000247867823ef7  0.009457      0.003104  0.005375  0.001681  0.008452   
2  00013b17ad220c46  0.012986      0.001538  0.005948  0.000809  0.009057   
3  00017563c3f7919a  0.003688      0.002232  0.003532  0.001094  0.003935   
4  00017695ad8997eb  0.035640      0.002335  0.008748  0.001732  0.011660   

   identity_hate  
0       0.200461  
1       0.003413  
2       0.002316  
3       0.000952  
4       0.002589  
