In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/datamining2024/sample.csv
/kaggle/input/datamining2024/train_target.csv
/kaggle/input/datamining2024/train_data.csv
/kaggle/input/datamining2024/test_data.csv


In [2]:
import sklearn
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from scipy import sparse
from scipy import stats
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization, Embedding, LSTM, Dropout, Dense, Bidirectional
import nltk
from nltk.corpus import stopwords
import re

To solve this problem, we combine predictions from two feature extractions and 3 classifiers, using the stacking techinique. In the first part of the notebook, all data will be split into training and validation sets, then predictions for validations set will be generated by each classifier, before retraining each on the whole dataset. A logistic regression classifier will then be trained on the validation predictions. Our model will extract features from data, feed them to the 3 classifiers, then feed the 3 predictions vectors to the logistic regression, which will output the final classification.

# Load data

In [3]:
#Loading and preparing training data
train_data = pd.read_csv("/kaggle/input/datamining2024/train_data.csv", encoding="utf8")
target = pd.read_csv("/kaggle/input/datamining2024/train_target.csv")
train_data = pd.merge(train_data, target, on="author")
#load test data
test_data = pd.read_csv("/kaggle/input/datamining2024/test_data.csv", encoding="utf8")

# Subreddit feature extraction: tf-idf and sparse matrix

In this first step, we aim to extract features related to subreddit interaction. From the data, a sparse binary matrix is created, mapping interactions, without accounting for frequency. To improve on this, a new matrix is created, where each element is the tf-idf value of a subreddit for an author. The two matrices are then stacked. The parameters for the TfidfVectorizer have been determined by means of cross validation in a separate notebook, to speed up the submission process 

In [4]:
#processing data
unique_authors = train_data['author'].drop_duplicates()
subreddit_data=train_data.groupby('author', sort=False).agg({'subreddit': ' '.join, 'gender':'first'})
#subreddit_data=subreddit_data.set_index('author').loc[unique_authors].reset_index()
print(subreddit_data.head())

                                                       subreddit  gender
author                                                                  
Shamus_Aran    mylittlepony standupshots WritingPrompts consp...       0
Riddance       sex AskReddit AskReddit sex 2007scape AskReddi...       1
Secret_Wizard  DragonsDogma AskReddit gaming darksouls tumblr...       0
Penultimatum   malefashionadvice funny coins coins malefashio...       0
7-SE7EN-7      todayilearned FanTheories FanTheories AskReddi...       0


In [5]:
#creating training and validation sets
features = subreddit_data["subreddit"].values
labels = subreddit_data["gender"].values
train_features, val_features, train_labels, val_labels = train_test_split(features, labels, test_size=0.2, random_state=0)

In [6]:
print(train_labels[:5], val_labels[:5])

[0 0 1 0 1] [1 0 0 1 0]


In [7]:
#tf-idf with parameters determined through 5-fold cross validation
vectorizer=TfidfVectorizer(max_df=0.95, min_df=25)
train_features_tfidf= vectorizer.fit_transform(train_features)
val_features_tfidf= vectorizer.transform(val_features)

In [8]:
#processing data
subreddits = train_data.subreddit.unique()
subreddits_map = pd.Series(index=subreddits, data=np.arange(subreddits.shape[0]))
def extract_features(group):
    group_subreddits = group['subreddit']
    group_subreddits = group_subreddits[group_subreddits.isin(subreddits_map.index)].values
    idxs = subreddits_map.loc[group_subreddits].values
    v = sparse.dok_matrix((1, subreddits.shape[0]))
    for idx in idxs:
        if not np.isnan(idx):
            v[0, idx] = 1
    return v.tocsr()

features_dict = {}

for author, group in train_data.groupby('author', sort=False):
    features_dict[author] = extract_features(group)
    
X = sparse.vstack([features_dict[author] for author in unique_authors])
X_train, X_val = train_test_split(X, test_size=0.2, random_state=0)

In [9]:
training_set=sparse.hstack([train_features_tfidf, X_train])
validation_set=sparse.hstack([val_features_tfidf, X_val])

# MultinomialNB on subreddit features

Model selection for this data has been performed in a separate notebook. Multinomial Naive Bayes has outperformed Random forests and MLPs on this task, so we fit it to the data. To perform model selection, the data was split 80/20 into training and test. To determine hyperparameters for the MultinomialNB, 5 fold cross validation was used.

In [10]:
#generate validation prediction with hyperparameters determined through 5-fold cross validation
multi_clf=MultinomialNB(alpha=0.2803776589210949, fit_prior=True)
multi_clf.fit(training_set, train_labels)
multi_val_pred=multi_clf.predict_proba(validation_set.toarray())[:,1]
print(roc_auc_score(val_labels, multi_val_pred))

0.9249375140849397


In [11]:
#retrain
vectorizer_sub=TfidfVectorizer(max_df=0.95, min_df=25)
all_features_tfidf=vectorizer_sub.fit_transform(features)
full_training_set=sparse.hstack([all_features_tfidf, X])
#MultiNB_clf=MultinomialNB(alpha=0.08936431812289214)
multi_clf.fit(full_training_set, labels)

# DNN on subreddits sparse matrix

During model selection for the previous section, the DNN performed particularly well on the sparse binary matrix alone, so it will be included as an additional classifier. The same remarks made in the previous section hold for model selection and hyperparameters tuning.

In [12]:
y=labels

In [13]:
#preparing data for a DNN
#train/validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)
print(y_train[:5], y_val[:5])
BUFFER_SIZE=1000
BATCH_SIZE=256

X_train_dense = X_train.toarray()
X_val_dense = X_val.toarray()
training_set=tf.data.Dataset.from_tensor_slices((X_train_dense,y_train)).shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
val_set=tf.data.Dataset.from_tensor_slices((X_val_dense, y_val)).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

[0 0 1 0 1] [1 0 0 1 0]


In [14]:
#DNN with predetermined hyperparameters 
DNN = tf.keras.Sequential([
    Dense(100, activation="relu", kernel_regularizer=tf.keras.regularizers.L2(l2=0.001)),
    Dense(100, activation="relu", kernel_regularizer=tf.keras.regularizers.L2(l2=0.001)),
    Dropout(0.4),
    Dense(100, activation="relu", kernel_regularizer=tf.keras.regularizers.L2(l2=0.001)),
    Dropout(0.2),
    Dense(1, activation="sigmoid")
])
        
DNN.compile(
    loss=tf.keras.losses.BinaryFocalCrossentropy(gamma=2.0),
    optimizer=tf.keras.optimizers.Adam(1e-3),
    metrics=["accuracy", tf.keras.metrics.AUC(name="auc")]
)
        
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_auc', patience=2, mode="max", restore_best_weights=True)
history = DNN.fit(
            training_set,
            epochs=5,
            validation_data=val_set,
            callbacks=[early_stopping]
        )

Epoch 1/5
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 37ms/step - accuracy: 0.7024 - auc: 0.5562 - loss: 0.5036 - val_accuracy: 0.7340 - val_auc: 0.7353 - val_loss: 0.3632
Epoch 2/5
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.7216 - auc: 0.7957 - loss: 0.3369 - val_accuracy: 0.7370 - val_auc: 0.8900 - val_loss: 0.2644
Epoch 3/5
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.7683 - auc: 0.9136 - loss: 0.2456 - val_accuracy: 0.8580 - val_auc: 0.9096 - val_loss: 0.2110
Epoch 4/5
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.8808 - auc: 0.9429 - loss: 0.1916 - val_accuracy: 0.8560 - val_auc: 0.9126 - val_loss: 0.1828
Epoch 5/5
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.9066 - auc: 0.9605 - loss: 0.1536 - val_accuracy: 0.8600 - val_auc: 0.9131 - val_loss: 0.1657


In [15]:
DNN_val_pred=DNN.predict(val_set.map(lambda features, labels: features))

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step


In [16]:
#retrain on the whole dataset
all_data=tf.data.Dataset.from_tensor_slices((X.toarray(),y)).shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

DNN = tf.keras.Sequential([
    Dense(100, activation="relu", kernel_regularizer=tf.keras.regularizers.L2(l2=0.001)),
    Dense(100, activation="relu", kernel_regularizer=tf.keras.regularizers.L2(l2=0.001)),
    Dropout(0.4),
    Dense(100, activation="relu", kernel_regularizer=tf.keras.regularizers.L2(l2=0.001)),
    Dropout(0.2),
    Dense(1, activation="sigmoid")
])
        
DNN.compile(
    loss=tf.keras.losses.BinaryFocalCrossentropy(gamma=2.0),
    optimizer=tf.keras.optimizers.Adam(1e-3),
    metrics=["accuracy", tf.keras.metrics.AUC(name="auc")]
)
        
history = DNN.fit(
            all_data,
            epochs=5,
        )

Epoch 1/5
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - accuracy: 0.7303 - auc: 0.5685 - loss: 0.4908
Epoch 2/5
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.7544 - auc: 0.8625 - loss: 0.3029
Epoch 3/5
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.7975 - auc: 0.9411 - loss: 0.2161
Epoch 4/5
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.9264 - auc: 0.9672 - loss: 0.1611
Epoch 5/5
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.9460 - auc: 0.9806 - loss: 0.1192


# Multinomial NB on comments tf-idf

Finally, we use the text of the comments as data, by aggregating all comments for each author and processing the resulting text as a single document. The resulting Tf-idf matrxi will be fed to a Multinomial Naive Bayes classifier, which proved the most effective model once again. As for the previous sections, model selection has been performed on a separate notebook, as well as hyperparameter tuning, for speed of submission and clarity.

In [17]:
#processing data
grouped_data=train_data.groupby('author', sort=False).agg({'body': ' '.join, 'gender':'first'})
grouped_data.head()

Unnamed: 0_level_0,body,gender
author,Unnamed: 1_level_1,Unnamed: 2_level_1
Shamus_Aran,I don't think we'd get nearly as much fanficti...,0
Riddance,"Thanks. I made it up, that's how I got over my...",1
Secret_Wizard,Are you sure you aren't confusing Cyclops (the...,0
Penultimatum,dont do this to me bro I'm missing the joke. W...,0
7-SE7EN-7,That's what we do when we can't find a mate Li...,0


In [18]:
# Ensure NLTK stopwords are downloaded
stop_words = stopwords.words('english')

# Custom function to remove stopwords
def remove_stopwords(text):
    words = [word.lower() for word in text.split() if word.lower() not in stop_words]
    return " ".join(words)

In [19]:
#creating training and validation sets
features = grouped_data["body"].apply(remove_stopwords).values
labels = grouped_data["gender"].values
#train_features, val_features, train_labels, val_labels = train_test_split(features, labels, test_size=0.2, random_state=0)
#print(train_labels[:5], val_labels[:5])

In [20]:
#train with selected hyperparameters and generate validation prediction
vectorizer_text=TfidfVectorizer(max_features=35000, max_df=0.9)
#train_features_tfidf= vectorizer_text.fit_transform(train_features)
features_tfidf= vectorizer_text.fit_transform(features)
#val_features_tfidf= vectorizer_text.transform(val_features)
train_features_tfidf, val_features_tfidf, train_labels, val_labels = train_test_split(features_tfidf, labels, test_size=0.2, random_state=0)
MultiNB_clf=MultinomialNB(alpha=0.05365580537557646, fit_prior=False)
MultiNB_clf.fit(train_features_tfidf, train_labels)
comment_val_pred=MultiNB_clf.predict_proba(val_features_tfidf.toarray())[:,1]
print(roc_auc_score(val_labels, comment_val_pred))

0.8929186044129397


In [21]:
#retrain
#train_features_tfidf=vectorizer_text.fit_transform(features)
MultiNB_clf.fit(features_tfidf, labels)

# Training LogisticRegression for stacking

Here, the Logistic regression is trained on the validation predictions generated by each classifier.

In [22]:
model_predictions=np.vstack((multi_val_pred, DNN_val_pred.flatten(), comment_val_pred)).T
target=val_labels

In [23]:
print(model_predictions[12:20,:]), print(target[12:20])

[[9.99996612e-01 7.62555361e-01 7.78430155e-01]
 [7.40221376e-06 5.30444756e-02 4.62976317e-01]
 [1.00843856e-01 2.08533287e-01 4.11555534e-02]
 [9.99999952e-01 7.31481791e-01 7.48438835e-01]
 [2.96883508e-06 3.23858559e-02 6.16671185e-02]
 [1.44508541e-04 1.18949853e-01 7.18787072e-01]
 [9.90460464e-01 5.38608313e-01 7.57613669e-01]
 [1.11263201e-03 1.64505675e-01 7.39976349e-02]]
[1 0 0 1 0 0 1 0]


(None, None)

In [24]:
model=LogisticRegression(solver='liblinear', class_weight='balanced')
lr_clf=RandomizedSearchCV(model, {'C':stats.uniform(scale=2), 'penalty':['l1', 'l2'], 'max_iter':[100, 500, 1000, 5000]}, n_iter=100, scoring='roc_auc', cv=10)
search=lr_clf.fit(model_predictions, target)
search.best_params_, search.best_score_

({'C': 0.9658625912369232, 'max_iter': 1000, 'penalty': 'l1'},
 0.941922823772139)

In [25]:
lr_clf=LogisticRegression(C=1.896099716053217, penalty='l1', solver='liblinear', class_weight='balanced', max_iter=500)
lr_clf.fit(model_predictions, target)

In [26]:
print(lr_clf.predict_proba(model_predictions)[:,1][:6]), print(target[:6])

[0.83394613 0.02125957 0.33877705 0.16834292 0.1011206  0.15756293]
[1 0 0 1 0 0]


(None, None)

# Test predictions

Finally, we process the test data and generate final predictions.

In [27]:
#processing test data for multinomial classifier on subreddits
unique_authors_test = test_data['author'].drop_duplicates()
subreddit_data_test=test_data.groupby('author', as_index=False).agg({'subreddit': ' '.join})
subreddit_data_test=subreddit_data_test.set_index('author').loc[unique_authors_test].reset_index()
print(subreddit_data_test.head())

          author                                          subreddit
0  ejchristian86  TwoXChromosomes harrypotter RedditLaqueristas ...
1      ZenDragon  gaming explainlikeimfive Nexus7 WTF gadgets ex...
2   savoytruffle  AskReddit AskReddit AskReddit AskReddit AskRed...
3   hentercenter  stlouisblues StLouis stlouisblues stlouisblues...
4   rick-o-suave                           army AskReddit army army


In [28]:
test_features = subreddit_data_test["subreddit"].values
#tf-idf on test features
test_features = vectorizer_sub.transform(test_features)
features_dict = {}

for author, group in test_data.groupby('author', sort=False):
    features_dict[author] = extract_features(group)
X_test = sparse.vstack([features_dict[author] for author in unique_authors_test])
test_set=sparse.hstack([test_features, X_test])
multinomial_test_pred=multi_clf.predict_proba(test_set.toarray())[:,1]

In [29]:
#processing data for DNN

X_test_dense = X_test.toarray()

test_set=tf.data.Dataset.from_tensor_slices(X_test_dense).batch(BATCH_SIZE)
DNN_test_prediction=DNN.predict(test_set).flatten()

[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step


In [30]:
#processing data for multinomial on text
unique_authors_test = test_data['author'].drop_duplicates()
test_data=test_data[["author", "subreddit", "body"]].fillna('').astype(str)
grouped_test_data=test_data.groupby('author', sort=False).agg({'body': ' '.join})
test_features = grouped_test_data["body"].apply(remove_stopwords).values
test_features = vectorizer_text.transform(test_features)
text_test_pred=MultiNB_clf.predict_proba(test_features)[:,1]

In [31]:
#stacking
test_predictions=np.vstack((multinomial_test_pred, DNN_test_prediction, text_test_pred)).T
print(test_predictions[:5])
lr_clf.predict_proba(test_predictions)[:,1]

[[1.00000000e+00 9.02094245e-01 9.83830614e-01]
 [2.67651383e-11 3.56406011e-02 2.94375159e-03]
 [2.76236103e-04 8.48236009e-02 5.83538062e-03]
 [1.01558400e-04 1.29280165e-01 5.85366591e-02]
 [1.10529113e-01 3.67660642e-01 8.57781449e-02]]


array([0.99664566, 0.0119328 , 0.01571273, ..., 0.08143475, 0.32380457,
       0.10554807])

In [32]:
gender_predictions=lr_clf.predict_proba(test_predictions)[:,1]

In [33]:
#create solution dataframe
solution = pd.DataFrame({"author":unique_authors_test, "gender":gender_predictions})
solution.head()

Unnamed: 0,author,gender
0,ejchristian86,0.996646
1,ZenDragon,0.011933
2,savoytruffle,0.015713
3,hentercenter,0.024153
4,rick-o-suave,0.108281


In [34]:
solution.to_csv("submission.csv", index=False)