In [None]:
!pip install emoji

In [None]:
!pip install pymorphy2

In [None]:
import emoji

In [None]:
import pymorphy2
morph = pymorphy2.MorphAnalyzer()

In [None]:
import re
import os
from os import listdir
from os.path import isfile, join
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 

In [None]:
# Project root; point it at your own checkout, e.g.:
#dir = '/home/username/authorship_attribution'
# NOTE: `dir` shadows the Python builtin; the name is kept because other cells read it.
dir = '/home/alissia/authorship_attribution'
dataset_dir = join(dir, 'dataset')
# Full paths of the regular files inside the dataset directory.
# The directory that is listed must be the same one the paths are built from,
# otherwise only names that happen to exist in both places are picked up.
flist = [join(dataset_dir, f) for f in listdir(dataset_dir) if isfile(join(dataset_dir, f))]

In [None]:
# Count the rows of every dataset file and keep the 50 largest files.
# The dataset contains more than 50 files, but a smaller subset is more convenient for experiments.
# 1 author = 1 file of 3200 tweets (the limit of the Twitter API), one tweet per line,
# 2 columns (username and text) separated by ','.
lenfiles = []
for path in flist:
    df = pd.read_csv(join(dir+'/dataset', path), sep=',', encoding='utf-8')
    lenfiles.append(len(df))
# (row count, path) pairs, largest first
ranked = list(zip(lenfiles, flist))
ranked.sort(reverse=True)
files = ranked[:50]

In [None]:
# Load the kaomoji list, one entry per line.
# The encoding is given explicitly (the file holds non-ASCII symbols and every other
# file in this notebook is handled as utf-8); blank lines are skipped so that an empty
# string never counts as a kaomoji.
with open(join(dir, 'kaomodji.txt'), 'r', encoding='utf-8') as f:
    kaomodji = [k.strip() for k in f if k.strip()]

In [None]:
# lemmatization -- optional
def lemm(word):
    """Lower-case `word` and return the normal form of the first parse `morph` gives for it."""
    parses = morph.parse(word.lower())
    first_parse = parses[0]
    return first_parse.normal_form

In [None]:
def preprocess(tweet):
    """Replace tweet-specific entities with placeholder tokens, tokenize and lemmatize.

    Steps, in order:
      1. @mentions, URLs and #hashtags become ``_mention_``, ``_URL_``, ``_hashtag_``.
      2. Emoji are converted to ``:name:`` by ``emoji.demojize`` and each one becomes ``_emodji_``.
      3. One ``_kaomodji_`` marker is appended for every token found in ``kaomodji``.
      4. ASCII emoticons become ``_emoticon_``.
      5. Hyphens and dashes are unified to an em dash.
      6. Runs of punctuation are separated from words, and every token goes through ``lemm``.

    Parameters:
        tweet (str): raw tweet text.

    Returns:
        str: the lemmatized tokens joined by single spaces.
    """
    # Raw strings: "\w", "\s", "\)" are invalid escapes in ordinary string literals.
    tweet = re.sub(r"@\w+", "_mention_", tweet)
    tweet = re.sub(r"https?://[^\s]*", "_URL_", tweet)
    tweet = re.sub(r"#[^\s]*", "_hashtag_", tweet)
    # Match one ":name:" at a time. A greedy ":.*:" would swallow all the text between
    # the first and the last colon of the tweet.
    tweet = re.sub(r":[^\s:]+:", "_emodji_", emoji.demojize(tweet))
    kaomoji_count = sum(1 for token in tweet.split() if token in kaomodji)
    if kaomoji_count:
        # Leading space keeps the first marker from being glued to the last word.
        tweet = tweet + " " + " ".join(['_kaomodji_'] * kaomoji_count)
    tweet = re.sub(r":-?\)|:-?\(|:-?O|:-?Ъ|\)-?:|\(-?:|=\)|=\(|\)=|\(=|;-?\)|;-?\(", "_emoticon_", tweet)
    # "--" has to come first in the alternation, otherwise it is consumed as two single hyphens.
    tweet = re.sub(r"--|-|−|–", "—", tweet)
    tweet = re.sub(r"([\w/'+$\s-]+|[^\w/'+$\s-]+)\s*", r"\1 ", tweet)
    # split() without arguments drops empty tokens and splits on newlines as well,
    # so the result is always a single line.
    return " ".join(lemm(word) for word in tweet.split())

In [None]:
# Normalizing the sample length: a single tweet is 1-270 chars long, so the
# 2000-3000+ lines of every author are merged down to 1000 longer samples.
TWEETS_PER_AUTHOR = 1000


def _merge_to_fixed_count(data, target):
    """Merge the strings in `data` into at most `target` space-joined samples.

    With at least 2*target items the first 2*target are joined pairwise, otherwise the
    first `target` items are taken as they are. Every remaining item is then appended
    to the samples in order, wrapping around after `target`, so any input size is
    handled without running past the end of the sample list.

    Returns a new list; `data` is not modified.
    """
    if len(data) >= 2 * target:
        merged = [data[k] + ' ' + data[k + 1] for k in range(0, 2 * target, 2)]
        leftover = data[2 * target:]
    else:
        # the authors were selected to exclude the accounts with less than 1000 tweets
        merged = data[:target]
        leftover = data[target:]
    # leftover is non-empty only when merged already holds `target` samples
    for offset, extra in enumerate(leftover):
        merged[offset % len(merged)] += ' ' + extra
    return merged


x = []
y = []

for _, path in files:
    df = pd.read_csv(join(dir+'/dataset', path), sep=',', encoding='utf-8')
    # NOTE(review): no tweets are filtered out by length here; if tweets shorter than
    # 3 words must be excluded, that has to happen before this cell -- confirm.
    # The number of rows is not more than 3200 (the limit of the Twitter API).
    data = [preprocess(tweet) for tweet in df['Tweets'].tolist()]
    tweets = _merge_to_fixed_count(data, TWEETS_PER_AUTHOR)
    x += tweets
    # take exactly one label per produced sample so that x and y stay aligned
    y += df['Username'].tolist()[:len(tweets)]

In [None]:
# every sample needs exactly one label
assert len(x) == len(y)
len(x)

In [None]:
# pair every sample with the label of its author
data = [(text, author) for text, author in zip(x, y)]
df = pd.DataFrame(data, columns=['text', 'author'])
df.head()

In [None]:
X = df['text']
y = df['author']
# fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
# sizes of the two parts: train (75%) first, then test (25%)
for part in (X_train, X_test):
    print(len(part))

In [None]:
# distinct authors present in the test part, in sorted order
authors = list(set(y_test))
authors.sort()
print(len(authors)) # 50

In [None]:
# train set: one sample per line
train_path = join(dir, 'ebd_train.txt')
with open(train_path, 'w', encoding='utf-8') as f:
    f.writelines(line + '\n' for line in X_train)

In [None]:
# test set: one file per author inside <dir>/ebd_data, one sample per line
mixed = sorted(list(zip(y_test, X_test)))
# The directory has to be joined as its own path component: dir+'ebd_data' glued the
# two names together and pointed outside the project directory.
test_dir = join(dir, 'ebd_data')
os.makedirs(test_dir, exist_ok=True)

# Group the samples once instead of rescanning the whole list for every author;
# `mixed` is sorted, so each author's samples keep their sorted order.
texts_by_author = {}
for author, text in mixed:
    texts_by_author.setdefault(author, []).append(str(text))

for author in authors:
    with open(join(test_dir, author+'.txt'), 'w', encoding='utf-8') as f:
        for text in texts_by_author.get(author, []):
            f.write(text+'\n')