In [12]:
import pandas as pd
import numpy as np
import re
import unidecode
import nltk
from nltk.tokenize import TweetTokenizer
from nltk import SnowballStemmer
import string
stopwords = nltk.corpus.stopwords.words('english')

import warnings
warnings.filterwarnings('ignore')

In [13]:
train = pd.read_csv('~/Documents/Datos/DataSets/TP2/train.csv')

In [14]:
def concatenate(x,char):
    words = ""
    for word in x:
        if word.startswith(char):
            words = words + word + " "
    return words

def count_vowels(x):
    return (x.count('a') + x.count('e') + x.count('i') + x.count('o') + x.count('u'))

def count_short_words(x):
    count = 0
    words = x.split(' ')
    for word in words:
        if 1 <= len(word) <= 3:
            count += 1
    return count

def count_stopwords(x):
    count = 0
    words = x.split(' ')
    for word in words:
        if word in stopwords:
            count += 1
    return count

In [15]:
def remove_punctuation(word):        
    clean_word = ''.join([char for char in word if char not in string.punctuation])
    return clean_word

def cleaning_text(text):
    tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
    stemmer = SnowballStemmer('english')
    text_tokenize = tokenizer.tokenize(text)
    wordlist = []
    for word in text_tokenize:
        word = word.lower()
        word = re.sub('(?P<url>https?://[^\s]+)', ' ', word)
        word = remove_punctuation(word)
        word = re.sub(r'[^\w]', ' ', word)
        word = unidecode.unidecode(word)
        word = re.sub(r'[0-9]','', word)
        if((word != '')&(word != ' ')&(word not in stopwords)):
            wordlist.append(word)
        word = stemmer.stem(word)
    clean_text = ' '.join(wordlist)
    return clean_text

In [16]:
train["special_chars_count"] =  train["text"]
train["special_chars_count"] =  train["special_chars_count"].str.lower()
train["special_chars_count"] = train["special_chars_count"].apply(lambda x: re.sub(r'[a-z]','',x))
train["special_chars_count"] = train["special_chars_count"].str.strip()
train["special_chars_count"] = train["special_chars_count"].apply(lambda x: re.sub(' +','', x))
train["special_chars_count"] = train["special_chars_count"].apply(lambda x: re.sub(r'[0-9]','', x))
train["special_chars_count"] = train["special_chars_count"].str.len()

train["hashtags"] = train["text"].str.lower().str.split(' ').apply(lambda x: concatenate(x,'#'))
train["labels"] = train["text"].str.lower().str.split(' ').apply(lambda x: concatenate(x,'@'))
train["hashtags_count"] = train["hashtags"].str.split(' ').apply(lambda x: len(x))-1
train["labels_count"] = train["labels"].str.split(' ').apply(lambda x: len(x))-1

train["num_chars_count"] = train["text"]
train["num_chars_count"] =  train["num_chars_count"].str.lower()
train["num_chars_count"] = train["num_chars_count"].apply(lambda x: re.sub(r'[a-z]','',x))
train["num_chars_count"] = train["num_chars_count"].apply(lambda x: re.sub(r'[^\w]','',x))
train["num_chars_count"] = train["num_chars_count"].str.strip()
train["num_chars_count"] = train["num_chars_count"].str.len()

train["links_count"] = train['text'].apply(lambda x: len([w for w in str(x).lower().split()
                                                           if 'http' in w or 'https' in w]))

train["clean_text"] = train["text"].apply(lambda x: cleaning_text(x)) # para el bag of words o tf-idf

train["semi_cleaned_text"] = train["text"].str.lower()
train["semi_cleaned_text"] = train["semi_cleaned_text"].apply(lambda x: re.sub('(?P<url>https?://[^\s]+)', ' ', x))
train["semi_cleaned_text"] = train["semi_cleaned_text"].apply(lambda x: re.sub(r'[^\w]', ' ', x))
train["semi_cleaned_text"] = train["semi_cleaned_text"].apply(lambda x: re.sub(r'_', ' ', x))
train["semi_cleaned_text"] = train["semi_cleaned_text"].apply(lambda x: re.sub(r'[0-9]',' ', x))
train["semi_cleaned_text"] = train["semi_cleaned_text"].apply(lambda x: re.sub(' +',' ', x))
train["semi_cleaned_text"] = train["semi_cleaned_text"].apply(lambda x: unidecode.unidecode(x))
train["semi_cleaned_text"] = train["semi_cleaned_text"].str.strip()
train["text_length"] = train["text"].str.len()

train["mean_word_length"] = train['text'].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
train["vowels_count"] = train["text"].apply(lambda x: count_vowels(x))
train["short_words_count"] = train["text"].apply(lambda x: count_short_words(x))
train["stopwords_count"] = train["text"].apply(lambda x: count_stopwords(x))
train["text"] = train["text"].apply(lambda x: re.sub(r'\b\w{1}\b', '', x))
train["words_count"] = train["text"].str.split(' ').apply(lambda x: len(x))

train["keyword"] = train["keyword"].str.replace('%20',' ')
train["keyword"] = train["keyword"].astype('category')

train.rename(columns={"target":"target_label"}, inplace=True)
# Si usamos BoW o TF-IDF
train.rename(columns={"location":"location_original"}, inplace=True)
train.rename(columns={"id":"id_original"}, inplace=True)
train.rename(columns={"text":"text_original"}, inplace=True)
train.rename(columns={"keyword":"keyword_original"}, inplace=True)

train.head()

Unnamed: 0,id_original,keyword_original,location_original,text_original,target_label,special_chars_count,hashtags,labels,hashtags_count,labels_count,num_chars_count,links_count,clean_text,semi_cleaned_text,text_length,mean_word_length,vowels_count,short_words_count,stopwords_count,words_count
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,1,#earthquake,,1,0,0,0,deeds reason earthquake may allah forgive us,our deeds are the reason of this earthquake ma...,69,4.384615,22,7,5,13
1,4,,,Forest fire near La Ronge Sask. Canada,1,1,,,0,0,0,0,forest fire near la ronge sask canada,forest fire near la ronge sask canada,38,4.571429,13,1,0,7
2,5,,,All residents asked to 'shelter in place' are ...,1,3,,,0,0,0,0,residents asked shelter place notified officer...,all residents asked to shelter in place are be...,133,5.090909,44,9,9,22
3,6,,,"13,000 people receive #wildfires evacuation or...",1,2,#wildfires,,1,0,5,0,people receive wildfires evacuation orders cal...,people receive wildfires evacuation orders in ...,65,7.125,24,1,1,9
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,2,#alaska #wildfires,,2,0,0,0,got sent photo ruby alaska smoke wildfires pou...,just got sent this photo from ruby alaska as s...,88,4.5,24,3,6,17


In [17]:
train.to_csv('~/Documents/Datos/DataSets/TP2/train_featured.csv', index=False)