In [1]:
import csv
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from datetime import datetime
from sklearn.model_selection import train_test_split

In [2]:
train_data = pd.read_csv("data/train_with_embeddings.csv")

In [3]:
# new features
train_data["url_count"] = train_data["urls"].apply(lambda s: s[1:-1].count("\'")/2)
train_data["text_len"] = train_data["text"].apply(lambda s: len(s))
train_data["hashtags_count"] = train_data["hashtags"].apply(lambda s: s[1:-1].count("\'")/2)
train_data["day"] = train_data["timestamp"].apply(lambda t: datetime.utcfromtimestamp(t/1000).day)
train_data["hour"] = train_data["timestamp"].apply(lambda t: datetime.utcfromtimestamp(t/1000).hour)

# indicators of keywords
train_data["Macron"] =  train_data["text"].apply(lambda s: ("macron" in s.lower().split()))
train_data["Zemmour"] =  train_data["text"].apply(lambda s: ("zemmour" in s.lower().split()))
train_data["Melenchon"] =  train_data["text"].apply(lambda s: ("melenchon" in s.replace("é","e").lower().split()))
train_data["rt"] =  train_data["text"].apply(lambda s: ("rt" in s.lower().split()))


In [4]:
# TODO: include sent analysis

In [5]:
# select useful columns
train_data_filtered = train_data.drop(["text", "urls", "mentions", "hashtags", "timestamp", "TweetID"], axis=1)
# train_data_filtered = train_data.loc[:, ["retweets_count","favorites_count","followers_count","statuses_count","friends_count",
#                                  "hashtags_count","hour","verified","url_count","text_len","rt","Macron","Zemmour","Melenchon"]]


X_train, X_eval, y_train, y_eval = train_test_split(train_data_filtered.drop("retweets_count", axis=1),
                                                    train_data_filtered["retweets_count"],
                                                    random_state=42, test_size=0.1)

# Standardize the data
normal_columns = train_data_filtered.drop(["hour", "verified", "Macron", "Zemmour", "Melenchon", "url_count", "rt", "retweets_count"], axis=1).columns
mu, sigma = X_train[normal_columns].mean(axis=0), X_train[normal_columns].std(axis=0)
X_train.loc[:, normal_columns] = (X_train[normal_columns] - mu) / sigma
X_eval.loc[:, normal_columns] = (X_eval[normal_columns] - mu) / sigma

In [6]:
import torch
import pytorch_lightning as pl
from setup_data import TweetsDataset,TweetsDatasetEmbedded
from pytorch_lightning.loggers import TensorBoardLogger
from torch.utils.data import DataLoader, random_split
from tweet_pred_model import TweetPredModel

device = torch.device("cpu")

In [7]:
# Importing model
model = TweetPredModel(embed_len=20, data_len=train_data_filtered.shape[1]-21, dense_size=128)    # 21 to account for target column


In [8]:
dataset = TweetsDatasetEmbedded(train_data_filtered, 20)

# Split into training and test
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
trainset, testset = random_split(dataset, [train_size, test_size])

# # Dataloaders
trainloader = DataLoader(trainset, batch_size=32, shuffle=True, num_workers=12)
testloader = DataLoader(testset, batch_size=32, shuffle=False, num_workers=12)

# train(train_data_filtered, 3)
trainer = pl.Trainer(limit_train_batches=100, max_epochs=2)
trainer.fit(model=model, train_dataloaders=trainloader, val_dataloaders=testloader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name     | Type       | Params
----------------------------------------
0 | textConv | Sequential | 264   
1 | linear3  | Linear     | 12.3 K
2 | linear4  | Linear     | 16.5 K
3 | linear5  | Linear     | 129   
----------------------------------------
29.2 K    Trainable params
0         Non-trainable params
29.2 K    Total params
0.117     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Validation: 0it [00:00, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

`Trainer.fit` stopped: `max_epochs=2` reached.


# Generate predictions

In [9]:
eval_data = pd.read_csv("data/evaluation_with_embeddings.csv")
tweet_ids = eval_data['TweetID']

In [10]:
# new features
eval_data["url_count"] = eval_data["urls"].apply(lambda s: s[1:-1].count("\'")/2)
eval_data["text_len"] = eval_data["text"].apply(lambda s: len(s))
eval_data["hashtags_count"] = eval_data["hashtags"].apply(lambda s: s[1:-1].count("\'")/2)
eval_data["day"] = eval_data["timestamp"].apply(lambda t: datetime.utcfromtimestamp(t/1000).day)
eval_data["hour"] = eval_data["timestamp"].apply(lambda t: datetime.utcfromtimestamp(t/1000).hour)

# indicators of keywords
eval_data["Macron"] =  eval_data["text"].apply(lambda s: ("macron" in s.lower().split()))
eval_data["Zemmour"] =  eval_data["text"].apply(lambda s: ("zemmour" in s.lower().split()))
eval_data["Melenchon"] =  eval_data["text"].apply(lambda s: ("melenchon" in s.replace("é","e").lower().split()))
eval_data["rt"] =  eval_data["text"].apply(lambda s: ("rt" in s.lower().split()))


In [11]:
# select useful columns
eval_data_filtered = eval_data.drop(["text", "urls", "mentions", "hashtags", "timestamp", "TweetID"], axis=1)

# Standardize the data
normal_columns = eval_data_filtered.drop(["hour", "verified", "Macron", "Zemmour", "Melenchon", "url_count", "rt"], axis=1).columns
eval_data_filtered.loc[:, normal_columns] = (eval_data_filtered[normal_columns] - mu) / sigma

In [12]:
dataset = TweetsDatasetEmbedded(eval_data_filtered, 20)
# loader = DataLoader(dataset, shuffle=False, num_workers=12)

Target column not present


In [13]:
# pred = model(loader)
# pred = model.predict(eval_data.values.astype(np.float32))

In [14]:
n_samples = eval_data.shape[0]

In [20]:
with open("data/predictions.csv", 'w') as f:
    writer = csv.writer(f)
    writer.writerow(["TweetID", "retweets_count"])
    # for index, prediction in enumerate(pred):
    #     writer.writerow([str(tweet_ids.iloc[index]) , str(int(prediction))])
    for index in range(n_samples):
        text, data = dataset.__getitem__(index)
        print(index, text, data)
        pred = model.forward(torch.from_numpy(text), torch.from_numpy(data))
        writer.writerow([str(tweet_ids.iloc[index]) , str(int(pred))])

0 [[-0.20457347  0.68767184 -0.35723564  0.28557587  0.6138217   0.4422944
  -0.89206773 -1.1100241   1.4433331  -0.3669716  -0.40171394  0.8294756
  -0.5305932   0.26922104 -1.0983874  -0.34983534 -0.41249552  0.7148075
   0.25872087  0.35057232]] [-1.7316911  -0.0550886  -0.07756063 -0.38448727 -0.4514297   0.
  0.         -1.057548    2.2080305   1.0834569  12.          0.
  0.          0.          0.        ]


RuntimeError: running_mean should contain 20 elements not 4