# 1- Load the required modules

In [None]:
import pandas as pd
import numpy as np
%matplotlib inline
import sklearn.model_selection as cv
import sklearn
# The colour and colorclassifier libraries must be installed first.
# Run the command: "pip install colour colorclassifier colormath==1.0.8"
from colour import Color
from colorclassifier import Classifier
from datetime import datetime

# 2- Load csv file with preprocessed location

In [None]:
# Text encoding used when reading the CSV file
ENCODING = 'latin1'

# Load the CSV file with preprocessed locations
# (the original locations were imputed to country locations).
data = pd.read_csv(
    "tweet_location_preprocessing/gender-classifier-tweet-location-preprocessed-0-18836.csv",
    encoding=ENCODING,
)

# Sanity check: rows with gender == 'unknown' or gender == NaN must already
# have been deleted, so every row carries one of the three known labels.
assert data.gender.isin(['male', 'female', 'brand']).all()

data.head()

# 3- Select attrs that will be used

In [None]:
# Columns that are not used in the rest of the notebook
unused_columns = [
    'Unnamed: 0',
    '_unit_id',
    '_golden',
    '_unit_state',
    '_trusted_judgments',
    '_last_judgment_at',
    'gender:confidence',
    'profile_yn',
    'profile_yn:confidence',
    'gender_gold',
    'profile_yn_gold',
    'profileimage',
    'tweet_id',
]
data = data.drop(unused_columns, axis=1)
data.head()

# 4- Preprocess fields with string values and transform coordinates to binary

In [None]:
# Replace the free-text attributes by their length in characters (note: this
# is a character count, not a word count; a missing value is measured as the
# string 'nan', i.e. length 3). Each column is rebuilt in one assignment
# instead of calling DataFrame.set_value cell by cell: set_value was
# deprecated in pandas 0.21 and removed in pandas 1.0.
for text_column in ('description', 'text', 'name'):
    data[text_column] = [len(str(value)) for value in data[text_column]]

# Turn the coordinates into a binary flag, stored as the strings
# "true"/"false": "false" when the tweet has no coordinates, "true" otherwise.
data['tweet_coord'] = data['tweet_coord'].isnull().map({True: "false", False: "true"})

# The columns now hold lengths, so rename them accordingly.
data.rename(columns={'description': 'description_length', 'text': 'text_length', 'name': 'name_length'}, inplace=True)
data.head()

# 5- Change color values to color categories

In [None]:
def hexToColor(hex):
    """Convert a hexadecimal colour string (without '#') to a colour name.

    The value is first normalised to exactly six characters: anything longer
    than six characters is replaced by '000000', and anything shorter is
    right-padded with '0'. The resulting colour is converted to 0-255 RGB
    components and handed to colorclassifier's Classifier, whose get_name()
    result is returned.
    """
    size = len(hex)
    if size > 6:
        # Too long to be a hex colour: fall back to all zeros
        hex = '0' * 6
    elif size < 6:
        padding = '0' * (6 - size)
        hex = hex + padding

    # Color.rgb holds fractional components; scale them to integers
    fractions = Color('#' + hex).rgb
    channels = [int(round(fractions[k] * 255)) for k in range(3)]
    return Classifier(rgb=channels).get_name()

sidebar_categories = data['sidebar_color'].apply(hexToColor)
data['sidebar_color'] = sidebar_categories
# There are 14 distinct colours, not counting values such as "1.10E+17",
# for which "0" was used instead.
# They are the colours of the Classifier library; the most similar one is chosen.
print("Sidebar colors:")
print(sidebar_categories.unique())

link_categories = data['link_color'].apply(hexToColor)
data['link_color'] = link_categories
print("Link colors:")
print(link_categories.unique())


def time(t):
    """Return 'Unknown' for a missing time zone, otherwise the value itself."""
    if pd.isnull(t):
        return 'Unknown'
    return t


data['user_timezone'] = data['user_timezone'].apply(time)
data.head()

# 6- Change creation date values to categories

In [None]:
# Tweets are only created at midday (between 12h and 13h), probably the time at which the tweets were collected
def whichTimeRange(hour):
    """Map an hour of the day to a named part of the day.

    Hours 1-5 give 'early_morning', 6-11 'morning', 12-13 'midday' and
    14-20 'afternoon'; every other accepted hour (21-23 and 0) gives
    'evening'.

    The hour must satisfy 0 <= hour < 24; this is checked with an assert,
    so an AssertionError is raised otherwise (unless asserts are disabled).
    """
    assert 0 <= hour < 24

    buckets = (
        (range(1, 6), 'early_morning'),
        (range(6, 12), 'morning'),
        (range(12, 14), 'midday'),
        (range(14, 21), 'afternoon'),
    )
    for hours, label in buckets:
        if hour in hours:
            return label

    # Not in any bucket above: 21 .. 0
    return 'evening'

def dateToRange(date):
    """Return the part-of-day category for a 'month/day/year hour:minute' string.

    The string is parsed with the format '%m/%d/%y %H:%M' and only its hour
    is passed on to whichTimeRange.
    """
    parsed = datetime.strptime(date, '%m/%d/%y %H:%M')
    return whichTimeRange(parsed.hour)

# Replace both timestamp columns by their part-of-day category
for date_column in ('created', 'tweet_created'):
    data[date_column] = data[date_column].apply(dateToRange)

In [None]:
# Display the first rows of the DataFrame to inspect the result
data.head()

# 7- Export CSV