In [2]:
#Create process function and count occurs function

# Import libraries
import numpy as np
import pandas as pd
import re 
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer


## Process Text

# Import data
data = pd.read_csv('train.csv')
print(data.head(2))

# Process function 
def process_tweet(tweet) : 
    # This function take a tweet and process it.
    # input : a tweet
    # output : tweet clean : a list of words containing a processed tweet 

    stemmer = PorterStemmer()

    # English stopwords
    en_stopword = stopwords.words("english")

    # Remove hashtag 
    tweet = re.sub(r'#', '', tweet)

    # Remove "$"
    tweet = re.sub(r'\$\w*', '', tweet)
    
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)

    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweet_clean = [stemmer.stem(word) for word in tweet_tokens if (word not in en_stopword and word not in string.punctuation)]

    return tweet_clean


# Definition of 
def build_freqs(tweets, yslist):
    """Build frequencies.
    Input:
        tweets: a list of tweets
        ys: an m x 1 array with the sentiment label of each tweet
            (either 0 or 1)
    Output:
        freqs: a dictionary mapping each (word, sentiment) pair to its
        frequency
    """
    # Convert np array to list since zip needs an iterable.
    # The squeeze is necessary or the list ends up with one element.
    # Also note that this is just a NOP if ys is already a list.
    #yslist = np.squeeze(ys).tolist()

    # Start with an empty dictionary and populate it by looping over all tweets
    # and over all processed words in each tweet.
    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(str(tweet)):
            pair = (word, y)
            if pair in freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1

    return freqs

id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   

   target  
0       1  
1       1  


In [3]:
# Get tweet and disaster real

all_tweets = list(data.text)
yslist = list(data.target)

# Split data in train and test set 
cut = int(len(all_tweets)*3/4)
train_x = all_tweets[:cut]
train_y = yslist[:cut]

test_x = all_tweets[cut:]
test_y = yslist[cut :]

# Get a freqs of words with feature "text"
freqs = build_freqs(train_x, train_y)
print(len(freqs))


12929


In [4]:
## Modeling

# Define sigmoid function
def sigmoid(z) : 
    # input : float of array/list of float
    # output : array 

    if type(z) != np.ndarray : 
        z = np.array(z)
    return 1/(1+ np.exp(-z))

# Gradient descent function
def gradientDescent(X, Y, learning_rate = 0.01, iteration = 100) : 
    """ Input:
    X: matrix of features 
    Y: corresponding labels of the input matrix x, dimensions (m,1)
    theta: weight vector of dimension (n+1,1)
    alpha: learning rate
    num_iters: number of iterations you want to train your model for
    Output:
    J: the final cost
    theta: your final weight vector
    """
    if type(X) != np.ndarray : 
        X = np.array(X)

    if type(Y) != np.ndarray : 
        Y = np.array(Y)

    # Number of rows
    m = X.shape[0]

    # Theta
    #np.random.seed(1)
    #theta = np.random.rand(X.shape[1]).reshape(-1,1)
    theta = np.zeros((3,1))

    # Loop over iteration
    for i in range(iteration) : 
        #if i%70 : 
        #    print (J, theta)

        # Compute of prediction
        Z = np.dot(X, theta)

        pred = sigmoid(Z)

        # Compute cost function 
        J = -1/m * (np.dot(Y.T , np.log(pred)) + np.dot((1-Y).T , np.log(1 - pred)))

        # update the weights theta
        theta = theta - (learning_rate/m) * np.dot(X.T, (pred - Y))

        J = float(J)

    return J, theta

In [5]:
# Feature extraction

def extract_features(tweet, freqs) : 
    """
    iput : a tweet and a freqs dictionary of tuple
    output : a feature vector of dimension 3
    """
    list_of_word = process_tweet(tweet)
    x = np.ones(3).reshape(1,-1)

    # Loop on list of word from cleaned tweet
    for word in list_of_word :
        
        x[0,1]=  freqs[(word, 1)] if (word, 1) in freqs else 0
    
        x[0,2] +=  freqs[(word, 0)] if (word, 0) in freqs  else 0

    return x
    
# Test of function feature_extract
print(extract_features(train_x[0], freqs))


[[  1.  39. 113.]]


In [7]:
# Training model

# Extract feature for text
X = np.zeros((len(train_x), 3))
print(X.shape)

for i, tweet in enumerate(train_x) : 
    X[i,:] = extract_features(tweet, freqs)

print(X.shape)

(5709, 3)
(5709, 3)


In [8]:
#Tuning hyper parameter

from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.base import BaseEstimator, RegressorMixin

class logisticRegression(BaseEstimator, RegressorMixin) :

    def __init__(self, learning_rate = 10**(-8), iteration = 1600) : 

        #self.X = X
        #self.Y = Y
        #self.theta = theta
        self.learning_rate = learning_rate
        self.iteration = iteration
        

    def fit(self, X, Y ) :

        # Check if train_x and train_y are np.ndarray
        #X = np.ones((len(X),3))

        #for i, tweet in enumerate(X) : 
        #    X[i,:] = extract_features(tweet, freqs)

        Y = np.array(Y).reshape(-1,1)

        assert( type(X) == np.ndarray)
        assert( type(X) == np.ndarray)

        self.J, self.theta = gradientDescent(X, Y, learning_rate = self.learning_rate, iteration = self.iteration)  
        return self

    def predict(self, X) : 
        """
        input : tweet : le tweet
        freqs : dictionary of tuple (word, label) and his frequency
        theta : a weight vector
        output  
        prob : probability a tweet concern a real disaster or not
        """ 
        #X = extract_features(tweet, freqs)

        prob = float(sigmoid(np.dot(X, self.theta)))

        return prob

    def score(self, X, Y) : 
        """
        input : 
        X : a list of tweet
        Y : label of tweet
        freqs : dictionary of pair (word, label) and his frequency
        theta_hat : weight vector
        output :
        accuracy 
        """
        num_correct = 0

        m = len(X)

        assert(len(X) == len(Y))

        for i, x in enumerate(X) :

            #feature = extract_features(tweet, freqs)

            prob = float(sigmoid(np.dot(x, self.theta)))

            pred_label = 1 if prob >= 0.5 else 0

            if Y[i] == pred_label : 
                num_correct += 1

        score = num_correct / m

        return score

In [10]:
# Model fitting

regLog = logisticRegression(learning_rate=10**-8)
reglog = regLog.fit(X, np.array(train_y))

# Printing of estimator theta and loss function J
print(reglog.theta, reglog.J)


[[-1.09244489e-06]
 [ 1.13821565e-04]
 [-2.92350928e-04]] 0.6869191645540338


In [23]:
# Test of regression model

# Modeling
#Y = np.array(train_y).reshape(-1,1)

#J, theta_hat = gradientDescent(X, Y, learning_rate = 0.000001, iteration = 400)
#print(theta_hat, J)

# Prediction
def predict_tweet(tweet, freqs, theta) : 
    """
    input : tweet : le tweet
    freqs : dictionary of pair (word, label) and his frequency
    theta : a weight vector
    output  
    prob : probability a tweet concern a real disaster or not
    """ 
    feature = extract_features(tweet, freqs)

    prob = float(sigmoid(np.dot(feature, theta)))

    return prob


    # Test of Regression logistique
def test_regression_logistique(test_x, test_y, freqs, theta) : 

    """
    input : 
    X : a list of tweet
    Y : label of tweet
    freqs : dictionary of pair (word, label) and his frequency
    theta_hat : weight vector
    output :
    accuracy 
    """
    num_correct = 0

    m = len(test_x) 

    assert(len(test_x) == len(test_y))

    for i, tweet in enumerate(test_x) : 
        prob = predict_tweet(tweet, freqs, theta)

        pred_label = round(prob)

        if test_y[i] == pred_label : 
            num_correct += 1

    accuracy = num_correct / m

    return accuracy

# Accuracy
accuracy = test_regression_logistique(test_x, test_y, freqs, reglog.theta)

print("The accuracy of model is {:4.2f}".format(accuracy))


The accuracy of model is 0.55


In [24]:
# Cross validation function

# Extract feature for text
XX = np.zeros((len(all_tweets), 3))
print(XX.shape)

for i, tweet in enumerate(all_tweets) : 
    XX[i,:] = extract_features(tweet, freqs)


scores = cross_val_score(regLog, XX, np.array(yslist), cv=5)
for score in scores :  
    print("Scores from cross validation {} are : {}.".format(i,score))

print('Model accuracy is : {}'.format(np.mean(scores)))


(7613, 3)
Scores from cross validation 7612 are : 0.6316480630334865.
Scores from cross validation 7612 are : 0.5883125410374261.
Scores from cross validation 7612 are : 0.525279054497702.
Scores from cross validation 7612 are : 0.5919842312746386.
Scores from cross validation 7612 are : 0.533508541392904.
Model accuracy is : 0.5741464862472314
