In [1]:
import csv
import logging
import pickle
import re
from datetime import datetime
from functools import partial
from operator import itemgetter

import numpy as np
import pandas as pd
from ebbe import distinct
from scipy import sparse
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.neighbors import NearestNeighbors, DistanceMetric
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer, normalize
from unidecode import unidecode

from stop_words import STOP_WORDS_FR

In [2]:
# Read the annotated tweets, drop rows whose id was already seen, and order
# them by id (ids are strings here, so the order is lexicographic).
# encoding is explicit so the French text does not depend on the platform
# default (pandas reads this same file as UTF-8 by default further down), and
# newline='' is what the csv module expects from the file object it is given.
with open('../data/event2018.tsv', encoding='utf-8', newline='') as f:
    TWEETS = sorted(
        distinct(csv.DictReader(f, delimiter='\t'), key=itemgetter('id')),
        key=itemgetter('id')
    )

In [3]:
TWEETS = [t for t in TWEETS if t['label']]

In [4]:
def find_date_created_at(created_at):
    """Split a ``created_at`` string into a date part and a time part.

    Values containing ``"+0000"`` are parsed with ``TWITTER_DATE_FORMAT``,
    everything else with ``STANDARD_DATE_FORMAT``.

    Returns:
        A ``(date, time)`` tuple of strings formatted as ``"%Y%m%d"`` and
        ``"%H:%M:%S"``.

    Raises:
        ValueError: propagated from ``datetime.strptime`` when the string
            does not match the selected format.
    """
    date_format = TWITTER_DATE_FORMAT if "+0000" in created_at else STANDARD_DATE_FORMAT
    parsed = datetime.strptime(created_at, date_format)
    return parsed.strftime("%Y%m%d"), parsed.strftime("%H:%M:%S")


def remove_repeted_characters(expr):
    """Cap every run of a repeated character at three occurrences.

    For example ``"loooool"`` becomes ``"loool"``; runs of three or fewer are
    left untouched.

    Line breaks are treated like any other character and are preserved. The
    earlier ``findall``-based version silently dropped them (``.`` does not
    match ``"\\n"``), which glued the last word of one line to the first word
    of the next.

    Args:
        expr: the text to clean.

    Returns:
        The text with runs of four or more identical characters shortened to
        exactly three.
    """
    # (.)\1{3,} only matches runs of four or more, so shorter runs are never
    # rewritten; DOTALL makes "." accept newlines as well.
    return re.sub(r"(.)\1{3,}", r"\1\1\1", expr, flags=re.DOTALL)


def camel_case_split(expr):
    """Return ``expr`` with a space inserted at each camel-case boundary.

    A boundary is either a lowercase letter followed by an uppercase one, or
    the last capital of an uppercase run that is followed by a lowercase
    letter (so ``"HTTPServer"`` gives ``"HTTP Server"``).
    """
    boundary_pattern = '.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)'
    # The pattern has no capturing group, so findall yields the full matches.
    chunks = re.findall(boundary_pattern, expr)
    return " ".join(chunks)


def format_text(text, **format):
    """Clean one tweet according to the boolean options passed as keywords.

    Steps, in order: URLs (``http...``) are always removed; mentions
    (``@...``) are removed when ``remove_mentions`` is true; the text is
    transliterated with ``unidecode`` when ``unidecode`` is true; the text is
    split on spaces and apostrophes, tokens made only of digits and at least
    five characters long are dropped, and hashtags are replaced by their
    camel-case split (without the ``#``) when ``hashtag_split`` is true;
    repeated characters are capped by ``remove_repeted_characters``; finally
    the result is lower-cased when ``lower`` is true.

    Args:
        text: raw tweet text.
        **format: must provide ``remove_mentions``, ``unidecode`` and
            ``lower``; ``hashtag_split`` is only looked up when a token
            starts with ``#``. A missing key raises ``KeyError``.

    Returns:
        The cleaned text; tokens are re-joined with single spaces.
    """
    # URLs are stripped unconditionally.
    text = re.sub(r"http\S+", '', text, flags=re.MULTILINE)
    if format["remove_mentions"]:
        text = re.sub(r"@\S+", '', text, flags=re.MULTILINE)
    if format["unidecode"]:
        # Map accented and other non-ASCII characters to ASCII look-alikes.
        text = unidecode(text)

    def _kept_tokens():
        for token in re.split(r"[' ]", text):
            # Skip long numbers: all digits and five characters or more.
            if token.isdigit() and len(token) >= 5:
                continue
            if token.startswith("#") and format["hashtag_split"]:
                yield camel_case_split(token[1:])
            else:
                yield token

    text = remove_repeted_characters(" ".join(_kept_tokens()))
    return text.lower() if format["lower"] else text

In [5]:
formatter = partial(format_text, remove_mentions=True, unidecode=True, lower=True, hashtag_split=True)

In [6]:
formatter('#Rennes - La sortie de prison de Djamel Beghal [Vidéo exclusive] via @letelegramme https://t.co/tbOthY1Ren')

'rennes - la sortie de prison de djamel beghal [video exclusive] via  '

In [7]:
vectorizer = CountVectorizer(stop_words=STOP_WORDS_FR, binary=True)

In [8]:
# Clean every tweet, then learn the vocabulary and build the binary
# document-term matrix in a single pass over the cleaned texts.
data = [formatter(tweet['text']) for tweet in TWEETS]
X = vectorizer.fit_transform(data)

In [9]:
len(vectorizer.get_feature_names())

55684

In [13]:
np.asarray(X.sum(axis=0))[0,vectorizer.vocabulary_['letelegramme']]

1

In [46]:
transformer = TfidfTransformer()
X2 = transformer.fit_transform(X)

In [113]:
transformer.idf_[vectorizer.vocabulary_['benalla']]

3.222393863862515

In [115]:
# Non-zero tf-idf weights of the first tweet as (column index, weight) pairs.
# The stored entries of the sparse row are read directly, so the cell no
# longer depends on a hard-coded column count (which raises IndexError as soon
# as it exceeds the vocabulary size) nor on one sparse lookup per column.
first_row = X2[0].tocoo()
v = sorted(
    (int(col), float(weight))
    for col, weight in zip(first_row.col, first_row.data)
    if weight
)
v

[(11217, 0.37215521880488034),
 (25768, 0.3724594746316988),
 (30739, 0.4844182110947467),
 (61170, 0.30288816404656704),
 (65496, 0.43935294959758503),
 (71481, 0.365063013301087),
 (79325, 0.2647077099615123)]

In [80]:
vectorizer.get_feature_names()[79325]

'video'

In [103]:
# Re-derive the smoothed idf by hand and inject it into a fresh transformer to
# check that it reproduces scikit-learn's own weights.
df = np.asarray(X.sum(axis=0))[0]

other_transformer = TfidfTransformer()
idf = np.log((len(TWEETS) + 1) / (df + 1)) + 1
# The diagonal must be stored with idf's float dtype: df holds integer counts,
# and building the matrix with df.dtype would truncate every idf value.
diag = sparse.diags(idf, offsets=0, shape=(len(df), len(df)), format="csr", dtype=idf.dtype)
other_transformer._idf_diag = diag
# transform, not fit_transform: fitting recomputes the idf from X and would
# silently overwrite the hand-made diagonal this cell is meant to validate.
X3 = other_transformer.transform(X)

In [112]:
other_transformer.idf_[vectorizer.vocabulary_['benalla']]

3.222393863862515

In [116]:
# Non-zero weights of the first tweet under the hand-made idf, as
# (column index, weight) pairs, for comparison with scikit-learn's own weights.
# Reading the stored entries of the sparse row avoids the hard-coded column
# count, which raises IndexError once it exceeds the vocabulary size.
first_row = X3[0].tocoo()
v = sorted(
    (int(col), float(weight))
    for col, weight in zip(first_row.col, first_row.data)
    if weight
)
v

[(11217, 0.37215521880488034),
 (25768, 0.3724594746316988),
 (30739, 0.4844182110947467),
 (61170, 0.30288816404656704),
 (65496, 0.43935294959758503),
 (71481, 0.365063013301087),
 (79325, 0.2647077099615123)]

## Direct solution

In [14]:
# strptime layout that find_date_created_at applies to `created_at` values
# containing "+0000" (e.g. "Mon Jul 23 09:15:00 +0000 2018").
TWITTER_DATE_FORMAT = "%a %b %d %H:%M:%S +0000 %Y"
# strptime layout that find_date_created_at applies to every other value
# (e.g. "2018-07-23 09:15:00").
STANDARD_DATE_FORMAT = "%Y-%m-%d %H:%M:%S"

def load_dataset(dataset, annotation, text=False):
    """Read a tab-separated tweet file into a de-duplicated, id-sorted DataFrame.

    Args:
        dataset: path of the TSV file, passed to ``pd.read_csv``.
        annotation: ``"annotated"`` keeps rows with a non-null ``label``;
            ``"examined"`` keeps rows with a non-null ``event``. Both filters
            are applied only when a ``label`` column exists; any other value
            keeps all rows.
        text: when equal to ``"text+"`` and a ``text+quote+reply`` column
            exists, that column becomes ``text`` and the original is kept as
            ``text_not_formated``.

    Returns:
        The frame with missing ``text`` replaced by ``""``, extra ``date`` and
        ``time`` columns derived from ``created_at``, duplicate ids dropped,
        rows sorted by ``id`` (a string column) and a fresh integer index.
    """
    frame = pd.read_csv(
        dataset,
        sep="\t",
        quoting=csv.QUOTE_ALL,
        dtype={"id": str, "label": float, "created_at": str, "text": str},
    )
    frame.text = frame.text.fillna("")

    has_label = "label" in frame.columns
    if has_label and annotation == "annotated":
        frame = frame[frame.label.notna()]
    elif has_label and annotation == "examined":
        # NOTE(review): the guard looks for a `label` column while the filter
        # reads `event` - confirm that this is intended.
        frame = frame[frame.event.notna()]
    if dataset == "data/event2018_image":
        frame = frame[frame.image.notna()]

    if text == "text+" and "text+quote+reply" in frame.columns:
        frame = frame.rename(columns={"text": "text_not_formated", "text+quote+reply": "text"})

    # One (date, time) pair per row, transposed into two column-sized tuples.
    date_time_pairs = frame["created_at"].apply(find_date_created_at)
    frame["date"], frame["time"] = zip(*date_time_pairs)
    return frame.drop_duplicates("id").sort_values("id").reset_index(drop=True)

In [15]:
class TfIdf:
    """Tf-idf model whose vocabulary and document frequencies grow batch by batch.

    Usage in this notebook: ``add_new_samples`` extends the vocabulary with the
    terms of a batch and updates ``df``; ``compute_vectors`` then turns the
    returned count matrix into L2-normalised tf-idf rows.

    Attributes:
        df: per-term document frequencies, aligned with ``features_names``.
        features_names: vocabulary terms, in the order they were first added.
        n_samples: number of rows passed to ``compute_vectors`` so far.
        name: the constant ``"tfidf"``.
        binary: forwarded to ``CountVectorizer(binary=...)``.
        stop_words: stop-word list for ``lang``, or ``None`` when ``lang`` is
            neither ``"fr"`` nor ``"en"``.
    """

    def __init__(self, lang="fr", binary=True):
        """Create an empty model.

        Args:
            lang: ``"fr"`` or ``"en"`` selects the matching stop-word list; any
                other value disables stop-word filtering.
            binary: whether term counts are clipped to 0/1.
        """
        self.df = np.array([])
        self.features_names = []
        self.n_samples = 0
        self.name = "tfidf"
        self.binary = binary
        if lang == "fr":
            self.stop_words = STOP_WORDS_FR
        elif lang == "en":
            # NOTE(review): STOP_WORDS_EN is not imported by the visible import
            # cell - confirm it is available before relying on lang="en".
            self.stop_words = STOP_WORDS_EN
        else:
            # The attribute used to be left unset here, which only surfaced
            # later as an AttributeError in get_new_features. None is the
            # CountVectorizer value for "no stop-word filtering".
            self.stop_words = None

    def load_history(self, lang):
        """Restore ``df``, ``features_names`` and ``n_samples`` from disk.

        ``lang == "fr"`` reads the ``event2018_*`` files, any other value the
        ``event2012_*`` files, all under ``twembeddings/models/``.

        Returns:
            ``self``, so the call can be chained.
        """
        if lang == "fr":
            dataset = "event2018"
        else:
            dataset = "event2012"
        for attr in ["df", "features_names", "n_samples"]:
            # SECURITY: pickle.load can execute arbitrary code; only load model
            # files that come from a trusted source.
            with open("twembeddings/models/" + dataset + "_" + attr, "rb") as f:
                setattr(self, attr, pickle.load(f))
        return self

    def save(self, dataset):
        """Pickle ``df``, ``features_names`` and ``n_samples``.

        Args:
            dataset: dataset path or name; only its last path component, with
                ``.tsv`` removed, is used as the file-name prefix under
                ``twembeddings/models/``.
        """
        dataset = dataset.split("/")[-1].replace(".tsv", "")
        for attr in ["df", "features_names", "n_samples"]:
            with open("twembeddings/models/" + dataset + "_" + attr, "wb") as f:
                pickle.dump(getattr(self, attr), f)

    def get_new_features(self, data):
        """Append to ``features_names`` the terms of ``data["text"]`` not seen yet.

        Stop words are excluded here, so they never enter the vocabulary.
        """
        features_set = set(self.features_names)
        fit_model = CountVectorizer(stop_words=self.stop_words)
        # see https://towardsdatascience.com/hacking-scikit-learns-vectorizers-9ef26a7170af for custom analyzr/tokenizr
        fit_model.fit(data["text"].tolist())
        # Terms ordered by their column index: the same list get_feature_names()
        # returned, without depending on that method (removed from recent
        # scikit-learn releases).
        vocabulary = fit_model.vocabulary_
        for term in sorted(vocabulary, key=vocabulary.get):
            if term not in features_set:
                self.features_names.append(term)

    def build_count_vectors(self, data):
        """Count the known vocabulary in ``data["text"]``.

        Returns:
            A sparse matrix with one column per entry of ``features_names``.
        """
        # sort words following features_name order, absent words will be counted as 0
        count_model = CountVectorizer(binary=self.binary, vocabulary=self.features_names)
        return count_model.transform(data["text"].tolist())

    def compute_df(self, count_vectors):
        """Return the stored document frequencies updated with a new batch.

        Args:
            count_vectors: sparse count matrix exposing ``indices`` (CSR, as
                returned by ``build_count_vectors``) with at least as many
                columns as ``self.df`` has entries.

        Returns:
            A new array; ``self.df`` itself is not modified.
        """
        n_features = count_vectors.shape[1]
        # Terms added since the last batch start with a frequency of zero.
        padding = np.zeros(n_features - len(self.df), dtype=self.df.dtype)
        previous_df = np.append(self.df, padding)
        # Each stored entry of the matrix is one (document, term) occurrence, so
        # counting column indices gives the batch document frequency; minlength
        # keeps a zero slot for the terms that are absent from the batch.
        return previous_df + np.bincount(count_vectors.indices, minlength=n_features)

    def add_new_samples(self, data):
        """Extend the vocabulary with a batch, update ``df`` and return its counts."""
        self.get_new_features(data)
        count_vectors = self.build_count_vectors(data)
        self.df = self.compute_df(count_vectors)
        # logging.info("Count matrix shape: {}".format(count_vectors.shape))
        return count_vectors

    def compute_vectors(self, count_vectors, min_df, svd=False, n_components=0):
        """Turn a count matrix into L2-normalised tf-idf vectors.

        Args:
            count_vectors: sparse counts aligned with ``self.df``.
            min_df: when positive, only terms whose document frequency is
                strictly greater than ``min_df`` are kept.
            svd: when true, reduce the result with ``TruncatedSVD`` followed by
                a ``Normalizer`` (LSA).
            n_components: number of SVD components, used only when ``svd``.

        Returns:
            The tf-idf matrix (sparse), or the dense LSA projection when
            ``svd`` is true.

        Note:
            Every call adds the number of rows to ``self.n_samples``; calling
            it twice on the same batch therefore counts that batch twice.
        """
        if min_df > 0:
            mask = self.df > min_df
            df = self.df[mask]
            count_vectors = count_vectors[:, mask]
        else:
            df = self.df
        self.n_samples += count_vectors.shape[0]
        # logging.info("Min_df reduces nb of features, new count matrix shape: {}".format(
        #     count_vectors.shape)
        # )
        # compute smoothed idf
        idf = np.log((self.n_samples + 1) / (df + 1)) + 1
        idf_diag = sparse.diags(idf, offsets=0, shape=(len(df), len(df)), format="csr", dtype=idf.dtype)
        # Scale each column by its idf, then L2-normalise each row. This is the
        # computation TfidfTransformer performs with an injected `_idf_diag`,
        # written out so it does not depend on that private attribute.
        X = normalize(count_vectors @ idf_diag, norm="l2", copy=False)
        if svd:
            logging.info("Performing dimensionality reduction using LSA")
            lsa = make_pipeline(
                TruncatedSVD(n_components=n_components, random_state=42),
                Normalizer(copy=False),
            )
            X = lsa.fit_transform(X)
            logging.info("New shape: {}".format(X.shape))

        return X

In [17]:
data = load_dataset('../data/event2018.tsv', 'annotated')

In [18]:
# Fresh French model; the tweets are cleaned in place with the same options as
# the `formatter` partial used in the first half of the notebook.
veco = TfIdf(binary=True, lang='fr')
data.text = data.text.apply(
    lambda raw_text: format_text(
        raw_text,
        remove_mentions=True,
        unidecode=True,
        lower=True,
        hashtag_split=True,
    )
)

In [19]:
count_matrix = veco.add_new_samples(data)

In [21]:
XX = veco.compute_vectors(count_matrix, min_df=10, svd=False)

In [24]:
XX

<95796x9803 sparse matrix of type '<class 'numpy.float64'>'
	with 870508 stored elements in Compressed Sparse Row format>

In [164]:
# Non-zero weights of the first tweet in the min_df-filtered matrix, as
# (column index, weight) pairs. The stored entries of the sparse row are read
# directly instead of probing a hard-coded number of columns, which raises
# IndexError whenever it exceeds the actual width of XX.
first_row = XX[0].tocoo()
v = sorted(
    (int(col), float(weight))
    for col, weight in zip(first_row.col, first_row.data)
    if weight
)
v

[(1701, 0.37215521880488034),
 (4218, 0.3724594746316988),
 (5045, 0.4844182110947467),
 (10017, 0.30288816404656704),
 (10814, 0.43935294959758503),
 (11872, 0.365063013301087),
 (13104, 0.2647077099615123)]

In [31]:
veco.df

array([ 81., 637.,   4., ...,   1.,   4.,   1.])

In [34]:
veco.n_samples

95796

In [28]:
veco.df[veco.features_names.index('benalla')]

12520.0

In [160]:
veco.df[11385]

14925.0

In [171]:
def cosine_distances(x, y, intel_mkl=False):
    """Pairwise cosine distances between the rows of ``x`` and the rows of ``y``.

    Args:
        x: matrix (scipy sparse or dense array) of shape ``(n_x, n_features)``.
        y: matrix of shape ``(n_y, n_features)``; ``None`` or ``x`` itself
            means "compare ``x`` with itself".
        intel_mkl: accepted for compatibility. No MKL-backed product is wired
            in, so the regular matrix product is used in both cases (this used
            to fail with an unbound local variable when the flag was set).

    Returns:
        A dense ``(n_x, n_y)`` array with values clipped to ``[0, 2]``. When
        ``x`` is compared with itself the diagonal is exactly ``0.0``.
    """
    # Decide this before normalising: normalize() cannot take None.
    same_input = y is None or y is x
    x_normalized = normalize(x, copy=True)
    y_normalized = x_normalized if same_input else normalize(y, copy=True)

    # "@" is a matrix product for sparse and dense operands alike, whereas "*"
    # would be element-wise on dense arrays.
    s = x_normalized @ y_normalized.T
    s = s.toarray() if sparse.issparse(s) else np.array(s, dtype=float)

    # distance = 1 - similarity, computed in place on the dense result
    s *= -1
    s += 1
    np.clip(s, 0, 2, out=s)
    if same_input:
        # Ensure that distances between vectors and themselves are set to 0.0.
        # This may not be the case due to floating point rounding errors.
        s[np.diag_indices_from(s)] = 0.0
    return s

In [191]:
distances = cosine_distances(XX[:10], XX[10:30])
neighbors = distances.argmin(axis=1)

In [197]:
distances[range(distances.shape[0]), neighbors]

(array([0.54289531, 0.56380689, 0.68234967, 1.        , 0.61963932,
        0.        , 0.85527133, 0.90336077, 0.88501953, 0.0980216 ]),
 array([ 3, 15, 19,  0, 15, 12, 15,  7, 15,  9]))

In [201]:
data.iloc[5]['text'], data.iloc[22]['text']

('algerie : le taux de natalite, parmi les plus eleves au monde, inquiete les autorites  via ',
 'parmi les plus eleves au monde, le taux de natalite en algerie inquiete les autorites ')