In [1]:
import sys

# uncomment any library you need to install and run the cell

#!{sys.executable} -m pip install numpy
#!{sys.executable} -m pip install matplotlib
#!{sys.executable} -m pip install scikit-learn
#!{sys.executable} -m pip install emoji
#!{sys.executable} -m pip install gensim

#!{sys.executable} -m pip install spacy
#!{sys.executable} -m spacy download en_core_web_sm

#!{sys.executable} -m pip install nltk
#import nltk
#nltk.download('words')
#nltk.download('wordnet')

In [4]:
import gzip
import pickle
import random
import sys

# make the project modules in ../app importable from this notebook
sys.path.append('../app')
from config import Config
from preprocessing import Preprocessing

In [5]:
# Project configuration object; later cells read the output locations
# `feature_x_path` and `feature_y_path` from it.
config = Config()

In [6]:
# Preprocessing driver built from the configuration; the cells below call
# its pipeline methods either all at once or one step at a time.
preprocessor = Preprocessing(config)

## Run all at once, or process by process
### All at once: steps whose output file already exists are skipped; otherwise this takes a long time

In [None]:
# Run the whole preprocessing pipeline in a single call. This is the
# alternative to executing the individual step cells further down.
preprocessor.preprocess_data()

### Run process by process

In [7]:
# Step 1: extract and aggregate the raw data (troll tweets first, then
# normal-user tweets). Per the log output, non-English troll tweets are
# skipped and malformed lines ("badtabs" / "badlines") are counted.
preprocessor.extract_type_and_text()

Extracting raw troll tweets...
Extracted 2116866 troll tweets (badtabs: 0, badlines: 1, skipped nonenglish: 829340)
Extracting raw user tweets...
0
1000000
2000000
3000000
4000000
5000000
0
1000000
2000000
3000000
bad tabs: 217985, bad lines: 223
Extracted 8783464 normal user tweets (badtabs: 217985, badlines: 223)


In [7]:
# Step 2: clean the extracted tweets. Per the log output, the cleaned troll
# and user tweets are saved as gzip-compressed pickles under ../data/.
preprocessor.clean_tweets()

Saving 2116866 cleaned troll tweets to ../data/troll_tweets_clean.pkl.gz
0
1000000
2000000
3000000
4000000
5000000
6000000
7000000
8000000
Saving 8783464 cleaned user tweets to ../data/user_tweets_clean.pkl.gz


In [12]:
# Step 3: use spaCy NLP to tokenize tweets - takes a very long time.
# NOTE(review): the logged sample records appear to carry the raw text,
# tokenized text, lemmas, POS tags and noun chunks - confirm against
# Preprocessing.tokenize_tweets.
preprocessor.tokenize_tweets()

Storing troll tweets tagged so far: 0 commercial	<LINK> So, did you crush your workout? <USER> @_kaskp <USER> <USER> <USER> <USER> <USER>	<LINK> So , did you crush your workout ? <USER> @_kaskp <USER> <USER> <USER> <USER> <USER>	<LINK> so , do you crush your workout ? <USER> @_kaskp <USER> <USER> <USER> <USER> <USER>	TAG RB , VBD PRP VB PRP$ NN . TAG NNP TAG TAG TAG TAG TAG	your_workout	
Storing complete 100000 tagged troll tweets
Storing user tweets tagged so far: 0 NormalUser	Congrats to Toby Warrior Shaye G. of New York, NY, the winner of the Toby Keith Norway Flyaway Sweepstakes! - <LINK>	Congrats to Toby Warrior Shaye G. of New York , NY , the winner of the Toby Keith Norway Flyaway Sweepstakes ! - <LINK>	congrat to Toby Warrior Shaye G. of New York , NY , the winner of the Toby Keith Norway Flyaway sweepstake ! - <LINK>	NNS IN NNP NNP NNP NNP IN NNP NNP , NNP , DT NN IN DT NNP NNP NNP NNP NNS . : TAG	Toby_Warrior_Shaye_G. New_York the_winner the_Toby_Keith_Norway_Flyaway_Sweepsta

In [None]:
# Step 4: calculate and store the feature records.
# NOTE(review): presumably this writes the ../data/*_features.pkl.gz files
# that the cells below load - confirm against Preprocessing.get_features.
preprocessor.get_features()

## Mix and store a subset of the data to work with

In [3]:
# Load the stored troll feature records and keep only the first 500,000
# so the working subset stays manageable.
# NOTE: pickle.load executes arbitrary code from the file - only load
# archives produced by this project's own pipeline.
with gzip.open('../data/troll_features.pkl.gz', 'rb') as troll_file:
    troll_feats = pickle.load(troll_file)[:500000]
len(troll_feats)

500000

In [4]:
# Load the stored normal-user feature records and truncate them to the
# same count as the troll records so both classes are equally represented.
# NOTE: pickle.load executes arbitrary code from the file - only load
# archives produced by this project's own pipeline.
with gzip.open('../data/user_features.pkl.gz', 'rb') as user_file:
    user_feats = pickle.load(user_file)[:len(troll_feats)]
len(user_feats)

500000

In [5]:
# Seed the global RNG before any shuffling so that the stored X / y files
# can be regenerated identically on a fresh "Restart & Run All".
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Shuffle each class in place before the two are mixed together.
random.shuffle(troll_feats)
random.shuffle(user_feats)

In [6]:
# Combine both classes into a NEW list. Binding `feats = troll_feats` and
# appending to it would alias the troll list (turning `troll_feats` into the
# mixed data) and drain `user_feats`, so re-running an earlier cell would
# silently operate on corrupted inputs. Concatenation only copies references,
# so the extra memory cost is negligible.
feats = troll_feats + user_feats

# Interleave the two classes randomly, in place.
random.shuffle(feats)
len(feats)

1000000

In [7]:
# Binary labels aligned with `feats`: 1 for a 'NormalUser' record, 0 for
# every other type.
y = [int(record['type'] == 'NormalUser') for record in feats]

In [8]:
# Persist the mixed feature records as a gzip-compressed pickle at the
# location configured in `config.feature_x_path`.
with gzip.open(config.feature_x_path, 'wb') as x_out:
    pickle.dump(feats, x_out)

In [9]:
# Persist the label list as a gzip-compressed pickle at the location
# configured in `config.feature_y_path`.
with gzip.open(config.feature_y_path, 'wb') as y_out:
    pickle.dump(y, y_out)