# Text analysis
# Is it possible to predict how popular a tweet will be?

In [3]:
import pickle as pk

import sys
import numpy as np
import scipy.sparse
import random as rnd
import sklearn.linear_model
import sklearn.datasets
import sklearn.svm
import sklearn.metrics
import sklearn.decomposition
import sklearn.feature_extraction.text
import sklearn.utils.sparsefuncs
from twython import Twython

from sklearn.preprocessing import KBinsDiscretizer

In [22]:
def run_prediction(filename):
    """Fit a linear model on tweet text and compare it with random guessing.

    Loads a pickled sequence of tweet records from ``filename``, projects the
    bag-of-words counts of each record's ``'text'`` onto 50 SVD components,
    and bins ``favorite_count * retweet_count`` into ordinal popularity
    ranks.  A linear regression is fitted on the first half of the records
    and evaluated on the remaining ones.

    Args:
        filename: Path to a pickle file holding a sequence of records that
            can be indexed with ``'text'``, ``'favorite_count'`` and
            ``'retweet_count'``.

    Returns:
        A ``(random_score, machine_score)`` pair: the mean absolute rank
        error of uniformly random guesses and of the regression, both
        measured on the held-out records.  Each score is a length-1 NumPy
        array because the target is a single column.  The random score uses
        the global ``random`` module state, so it varies between calls
        unless that state is seeded by the caller.
    """
    # SECURITY: unpickling can execute arbitrary code -- only load files
    # that come from a trusted source.
    with open(filename, 'rb') as fh:
        data = pk.load(fh)

    # NOTE(review): the vectorizer, the SVD and the discretizer below are all
    # fitted on every record, including the held-out half, so the evaluation
    # is not fully independent of the test data.

    # Feature extraction: unigram counts (English stop words removed),
    # reduced to 50 dense components.
    vectorizer = sklearn.feature_extraction.text.CountVectorizer(
        stop_words='english', ngram_range=(1, 1), dtype='double')
    counts = vectorizer.fit_transform([e['text'] for e in data])
    svd = sklearn.decomposition.TruncatedSVD(n_components=50)
    features = svd.fit_transform(counts)

    # Target extraction: popularity is favourites times retweets, binned
    # into ordinal ranks.
    popularity = np.array(
        [e['favorite_count'] * e['retweet_count'] for e in data]
    ).reshape(-1, 1)
    enc = KBinsDiscretizer(n_bins=10, encode='ordinal')
    target = enc.fit_transform(popularity)

    # Train on the first half of the records.
    split = features.shape[0] // 2
    regression = sklearn.linear_model.LinearRegression()
    regression.fit(features[:split], target[:split])

    # Predict the second half.
    expected = target[split:]
    predicted = regression.predict(features[split:])

    # Random baseline: one guess per held-out record (the held-out half is
    # one longer than the training half when the record count is odd), drawn
    # from the ranks the discretizer actually produced -- it may keep fewer
    # bins than requested when bin edges coincide.
    n_ranks = int(enc.n_bins_[0])
    guesses = np.array(
        [rnd.randrange(0, n_ranks) for _ in range(len(expected))],
        dtype=float,
    ).reshape(-1, 1)

    # Mean absolute error per target column (a single column here).
    machine_score = np.abs(expected - predicted).mean(axis=0)
    random_score = np.abs(expected - guesses).mean(axis=0)
    return random_score, machine_score

In [34]:
# Evaluate the model on the pickled tweet data set and report both scores;
# run_prediction returns the random-guess score first, then the model score.
random_score, machine_score = run_prediction("elonmuskdata.bin")
print(
    'Random score: {0} | Our score: {1}'.format(
        random_score,
        machine_score,
    )
)

Random score: [3.30201342] | Our score: [2.27193441]
