The sections that are commented out were left in to allow for the creation of an uneven, but larger, dataset.

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from tensorflow.python.lib.io.tf_record import TFRecordWriter

from sklearn.model_selection import train_test_split

import json

# CHANGE THESE TO TRAINING DATA FILEPATH AND OUT-FILEPATH
prelabeled_tweets = '../data/prelabeled/tweets_47k.csv'

OUTFILE_prefix = '../data/prelabeled/'

In [2]:
DF = pd.read_csv(prelabeled_tweets)
DF.shape

(47408, 2)

In [3]:
DF.head()

Unnamed: 0,Stance,Tweet
0,0,RT @redostoneage: NASAs changing Facts; Guess ...
1,0,Air ban led by flawed computer models. Global ...
2,0,@jiminhofe to Address Global Warming Skeptics ...
3,0,"Peoples World: GRAND RAPIDS, Mich. - Opponents..."
4,0,"In a half hour, Carol Browner, Assistant to th..."


In [4]:
DF.Stance.unique()
DF.Stance = DF.Stance.astype('int32', copy = False)

In [5]:
def clean_tweets():
    '''
    Takes the DF defined above and (in this order) applies the following preprocessing steps:
    1. Remove cases
    2. Replaces and URL's with "LINK"
    3. Replaces any twitter handels with "USERNAME"
    4. Removes any punctuation
    
    Note: Stop words will not be removed in this iteration because they may add some information.
    '''
    # Remove cases from the tweets
    DF.Tweet = DF.Tweet.str.lower()
    
    # Remove URL links
    DF.Tweet = DF.Tweet.str.replace('http\S+|www.\S+', 'LINK', case = False)
    
    # Remove usernames
    DF.Tweet = DF.Tweet.str.replace('@.*w', 'USERNAME ', case = False)
    
    # Remove #'s? - Uncomment next line if you aren't using the next filter
#     DF.Tweet = DF.Tweet.str.replace('#', '', case = False)
    
    # Remove remaining punctuation
    DF.Tweet = DF.Tweet.str.replace('[^\w\s]', '')
    
    # Convert Stance to a numerical val - Alread done for current DF
    # stances = {'NONE':0, 'AGAINST':-1, 'FAVOR':1}
    # DF.Stance =DF.Stance.map(stances)
    # DF.astype({'Stance': 'int32'}, copy = False)
    
clean_tweets()

In [6]:
print(f"0's: {(DF.Stance == 0).sum()}")
print(f"1's: {(DF.Stance == 1).sum()}")
print(f"-1's: {(DF.Stance == -1).sum()}")

0's: 11642
1's: 29519
-1's: 6247


In [7]:
print(DF.Stance.shape)
print(DF.dtypes)

(47408,)
Stance     int32
Tweet     object
dtype: object


In [8]:
# Sampling 6247 from each label
df_pos = DF[DF.Stance == 1].sample(6247, replace = False)
df_neu = DF[DF.Stance == 0].sample(6247, replace = False)
df_neg = DF[DF.Stance == -1].sample(6247, replace = False)
print(df_pos.shape, df_neu.shape, df_neg.shape)

(6247, 2) (6247, 2) (6247, 2)


In [9]:
df = pd.concat([df_pos, df_neu, df_neg])
print(df.shape)

(18741, 2)


In [11]:
# Make All
# X_train, X_test, y_train, y_test = train_test_split(DF.Tweet, DF.Stance, test_size = .2, shuffle = True)

# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = .2, shuffle = True)


# Make evenly classed subsample 
X_train, X_test, y_train, y_test = train_test_split(df.Tweet, df.Stance, test_size = .2, shuffle = True)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = .2, shuffle = True)

In [12]:
print(X_train.shape)
print(X_test.shape)
print(X_val.shape)
print(y_train.shape)
print(y_test.shape)
print(y_val.shape)

(11993,)
(3749,)
(2999,)
(11993,)
(3749,)
(2999,)


In [13]:
train = pd.DataFrame(np.array([X_train, y_train]).T)
test = pd.DataFrame(np.array([X_test, y_test]).T)
val = pd.DataFrame(np.array([X_val, y_val]).T)

In [14]:
train

Unnamed: 0,0,1
0,rt USERNAME dump on climate change narrative ...,-1
1,rt USERNAME eet LINK,-1
2,tuesday open thread climate change edition b...,0
3,tonight at 630pm dont miss out on gri_lse publ...,1
4,favor,1
...,...,...
11988,rt USERNAME can plants adjust to climate change,1
11989,USERNAME arming tcot,-1
11990,USERNAME arming,0
11991,rt USERNAME hy his embrace of science matters ...,1


In [15]:
train.shape

(11993, 2)

In [None]:
train_csv = train.values
test_csv = test.values
val_csv = val.values

In [None]:
def make_tf_ex(feats, lab):
    tf_ex = tf.train.Example(features = tf.train.Features(feature= {
        'idx' : tf.train.Feature(int64_list = tf.train.Int64List(value = [feats[0]])),
        'sentence' : tf.train.Feature(bytes_list = tf.train.BytesList(value = [feats[1].encode('utf-8')])),
        'label' : tf.train.Feature(int64_list = tf.train.Int64List(value = [lab]))
    }))
    
    return tf_ex

In [None]:
def convert_csv_to_tf_record(csv, file_name):
    writer = TFRecordWriter(file_name)
    for index,row in enumerate(csv):
        try:
            if row is None:
                print("row was None")
                raise Exception('Row Missing')
                
            if row[0] is None or row[1] is None:
                print("row[0] or row[1] was None")
                raise Exception('Value Missing')
                
            if row[0].strip() is '':
                print("row[0].strip() was ''")
                raise Exception('Utterance is empty')
                
            feats = (index, row[0])
            lab = row[1]
            example = make_tf_ex(feats, lab)
            writer.write(example.SerializeToString())

        except Exception as inst:
            print(type(inst))
            print(Exception.args)
            print(Exception.with_traceback)
            
    writer.close()

def generate_json_info(local_file_name):
    info = {"train_length": len(train),
            "val_length": len(val),
            "test_length": len(test)}

    with open(local_file_name, 'w') as outfile:
        json.dump(info, outfile)

In [None]:
# Make All

# convert_csv_to_tf_record(train_csv, "data/train_large.tfrecord")
# convert_csv_to_tf_record(test_csv, "data/test_large.tfrecord")
# convert_csv_to_tf_record(val_csv, "data/val_large.tfrecord")

# Make even subsample - ~18,000 in total
convert_csv_to_tf_record(train_csv, OUTFILE_prefix + "train47.tfrecord")
convert_csv_to_tf_record(test_csv, OUTFILE_prefix + "test47.tfrecord")
convert_csv_to_tf_record(val_csv, OUTFILE_prefix + "val47.tfrecord")

In [None]:
generate_json_info("../data/lengths/tweet47_info.json")