# Course 1: Classification and Vector Spaces
# Week 1: Logistic Regression for Sentiment Analysis of Tweets
## Own project: Analyzing the sentiment of my own tweets

## Download my own tweets

In [1]:
%%capture
%run assignment.ipynb

In [2]:
import tweepy
import json

In [3]:
# Load Twitter credentials
api_keys = json.load(open('twitter-api-keys.local.json'))
bearer_token = api_keys["bearer_token"]
client = tweepy.Client(
    bearer_token=api_keys['bearer_token'], wait_on_rate_limit=True)

auth = tweepy.OAuthHandler(
    consumer_key=api_keys['api_key'],
    consumer_secret=api_keys['api_key_secret'],
    access_token=api_keys['access_token'],
    access_token_secret=api_keys['access_token_secret'])
api = tweepy.API(auth, wait_on_rate_limit=True)

In [4]:
me = "fcx_xm"
tweets = api.user_timeline(screen_name=me, count=1000)
tweet_text = [tweet.text for tweet in tweets]

## Creating preprocessing functions for Spanish tweets

In [5]:
import unidecode
import nltk
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
stemmer = SnowballStemmer("spanish")

nltk.download('punkt')

True

In [6]:
# String cleaning function
def clean_string(string):
    """
    Input:
        - string: a string to be cleaned
    Output:
        - cleaned_string: a clean string with the following actions:
            - lowercase
            - remove punctuation
            - remove URLs, hashtags, mentions
    """
    string = re.sub(r'http\S+', '', string)
    string = re.sub(r'RT', '', string)
    string = re.sub(r'@\S+', '', string)
    string = re.sub(r'#', '', string)
    string = re.sub(r'[^\w\s]', '', string)
    string = string.lower()
    string = unidecode.unidecode(string)

    return string

In [7]:
# Tokenize function
def tokenize(string):
    """
    Input:
        - string: a string to be tokenized
    Output:
        - tokens: a list of tokens
    """
    tokens = word_tokenize(string)

    return tokens

In [8]:
# Remove spanish stopwords function
def remove_stopwords(tokens):
    """
    Input:
        - tokens: a list of tokens
    Output:
        - tokens: a list of tokens without stopwords
    """
    stopwords = nltk.corpus.stopwords.words('spanish')
    tokens = [token for token in tokens if token not in stopwords]

    return tokens

In [9]:
# Stem function for spanish
def stem(tokens):
    """
    Input:
        - tokens: a list of tokens
    Output:
        - tokens: a list of stems
    """
    stems = [stemmer.stem(token) for token in tokens]

    return stems

In [10]:
# Show transformation progress
print('Original text:')
print(tweet_text[0])
print('Cleaned text:')
print(clean_string(tweet_text[0]))
print('Tokenized text:')
print(tokenize(clean_string(tweet_text[0])))
print('Removed stopwords:')
print(remove_stopwords(tokenize(clean_string(tweet_text[0]))))
print('Stemmed text:')
print(stem(remove_stopwords(tokenize(clean_string(tweet_text[0])))))

Original text:
hoy le he cambiado el filtro de la cabina al coche porque traia el original de hace 10 años xd https://t.co/TlSnID72OE
Cleaned text:
hoy le he cambiado el filtro de la cabina al coche porque traia el original de hace 10 anos xd 
Tokenized text:
['hoy', 'le', 'he', 'cambiado', 'el', 'filtro', 'de', 'la', 'cabina', 'al', 'coche', 'porque', 'traia', 'el', 'original', 'de', 'hace', '10', 'anos', 'xd']
Removed stopwords:
['hoy', 'cambiado', 'filtro', 'cabina', 'coche', 'traia', 'original', 'hace', '10', 'anos', 'xd']
Stemmed text:
['hoy', 'cambi', 'filtr', 'cabin', 'coch', 'trai', 'original', 'hac', '10', 'anos', 'xd']


In [24]:
# Entire process of the string
def process_text(tweet):
    """
    Input:
        - tweet: a string to be processed
    Output:
        - tokens: a list of tokens
    """
    tweet = clean_string(tweet)
    tokens = tokenize(tweet)
    tokens = remove_stopwords(tokens)
    tokens = stem(tokens)

    return tokens

In [25]:
# Processed tweets
processed_tweets = [process_text(tweet) for tweet in tweet_text]
print(processed_tweets[:5])

[['hoy', 'cambi', 'filtr', 'cabin', 'coch', 'trai', 'original', 'hac', '10', 'anos', 'xd'], ['da'], ['vist', 'llev'], ['myth', 'we', 'dont', 'hav', 'solution', 'to', 'nuclears', 'wast', 'problem', 'reality', 'nucl', 'wast', 'isnt', 'problem', 'in', 'fact', 'its', 'the', 'best'], ['mir', 'contraluz']]


## Getting data for spanish positivity and negativity

I will be using a dataset of positive and negative texts.

URL: https://www.kaggle.com/datasets/luisdiegofv97/imdb-dataset-of-50k-movie-reviews-spanish?select=IMDB+Dataset+SPANISH.csv

In [18]:
import pandas

In [22]:
# Load the imdb_dataset.csv
# Keep columns review_es, sentimiento. In sentimiento, replace "positivo" with 1 and "negativo" with 0
data = pandas.read_csv('imdb_dataset.csv')
data = data[['review_es', 'sentimiento']]
data.sentimiento = data.sentimiento.replace(['positivo', 'negativo'], [1, 0])


Unnamed: 0,review_es,sentimiento
0,Uno de los otros críticos ha mencionado que de...,1
1,Una pequeña pequeña producción.La técnica de f...,1
2,Pensé que esta era una manera maravillosa de p...,1
3,"Básicamente, hay una familia donde un niño peq...",0
4,"El ""amor en el tiempo"" de Petter Mattei es una...",1


In [None]:
# Save data in a new csv file
data.to_csv('imdb_dataset_processed.csv', index=False)

In [23]:
# Create x and y lists
x = data.review_es
y = data.sentimiento

# Print head 5 and len of both
print(data.head(5))
print(len(x))
print(len(y))


                                           review_es  sentimiento
0  Uno de los otros críticos ha mencionado que de...            1
1  Una pequeña pequeña producción.La técnica de f...            1
2  Pensé que esta era una manera maravillosa de p...            1
3  Básicamente, hay una familia donde un niño peq...            0
4  El "amor en el tiempo" de Petter Mattei es una...            1
50000
50000


In [31]:
# Create frequency distribution of the words in the reviews

def build_freqs(train_x, train_y):
    """
    Input:
        - train_x: the list of reviews
        - train_y: the list of labels
    Output:
        - freqs: a dictionary with the frequency of each word in the reviews

    Dictionary structure:
    {
        'word1': {
            'positive': int,
            'negative': int
    }
    """
    
    # Iterate thru the length of the array
    # Tokenize each review in train x

In [69]:
freqs = build_freqs(x, y)

In [71]:
# Show sorted list of words and their frequencies
print(sorted(freqs.items(), key=lambda x: x[1]['positive'] + x[1]['negative'], reverse=True)[:10])

[('pelicul', {'positive': 0, 'negative': 173675}), ('mas', {'positive': 0, 'negative': 57849}), ('the', {'positive': 0, 'negative': 53699}), ('si', {'positive': 0, 'negative': 40682}), ('hac', {'positive': 0, 'negative': 38019}), ('pued', {'positive': 0, 'negative': 35100}), ('sol', {'positive': 0, 'negative': 31429}), ('buen', {'positive': 0, 'negative': 30886}), ('ser', {'positive': 0, 'negative': 28230}), ('histori', {'positive': 0, 'negative': 27579})]


### Train and test data

In [42]:
# Create train and test datasets
train_x = x[:int(len(x) * 0.8)]
train_y = y[:int(len(y) * 0.8)]
test_x = x[int(len(x) * 0.8):]
test_y = y[int(len(y) * 0.8):]

# Print len of all
print(len(train_x))
print(len(train_y))
print(len(test_x))
print(len(test_y))

40000
40000
10000
10000


### Sigmoid, gradient descent, feature extraction, and logistic regression

In [47]:
# Sigmoid function
def sigmoid(x):
    """
    Input:
        - x: a number
    Output:
        - sigmoid: the sigmoid of x
    """
    sigmoid = 1 / (1 + np.exp(-x))

    return sigmoid

In [58]:
# Gradient descent
def gradient_descent(x, y, theta, alpha, iterations):
    '''
    Input:
        x: matrix of features which is (m,n+1)
        y: corresponding labels of the input matrix x, dimensions (m,1)
        theta: weight vector of dimension (n+1,1)
        alpha: learning rate
        num_iters: number of iterations you want to train your model for
    Output:
        J: the final cost
        theta: your final weight vector
    '''
    m = x.shape[0]
    J_history = []
    for i in range(iterations):
        z = np.dot(x, theta)
        h = sigmoid(z)
        J = -1./m * (np.dot(y.transpose(), np.log(h)) + np.dot((1-y).transpose(),np.log(1-h)))  
        J_history.append(J)
        theta = theta - (alpha/m) * np.dot(x.transpose(), (h-y))
    J = float(J)
    return J_history, theta


In [66]:
# Feature extraction function
def feature_extraction(text, freqs):
    """
    Input:
        - word: a text
        - freqs: a dictionary with the frequency of each word in the reviews
    Output:
        - features: an array of (1, 3) with the following features:
            - 1 = 1
            - 2 = frequency of the tokens in the positive reviews
            - 3 = frequency of the tokens in the negative reviews
    """
    features = np.zeros((1, 3))
    features[0, 0] = 1
    tokens = process_text(text)
    for token in tokens:
        features[0][1] += freqs[token]['positive']
        features[0][2] += freqs[token]['negative']

    return features

In [67]:
# Test feature extraction with 5 random reviews
print('Feature extraction with 5 random reviews:')
idx = np.random.randint(0, len(x), 5)
for i in idx:
    print("Review:", x[i])
    # Print in scientific notation
    print("Features:", feature_extraction(x[i], freqs))

Feature extraction with 5 random reviews:
Review: Vi a Marigold en una vista previa que se muestra hace unos días, y encontré que era una película completamente absorbente y agradable. La película se trata de una actriz estadounidense no tan exitosa que va a la India para que actúe en una película de bajo presupuesto, solo para encontrarse trenzada allí cuando se encuentra a la llegada de que la financiación de la película ha desaparecido, junto con los productores e inversores. Un encuentro casual con una película india que disparara a la cercana la lleva a ser contratada por un pequeño papel de bailarina en eso. Dado que las películas indias incorporan una cantidad significativa de cantar y bailes, este es un problema para que Marigold, que tiene dos pies izquierdos, por no mencionar una personalidad tan estrechamente enrollada y espinosa que apenas puede escuchar la música, y mucho menos sentirlo, y mucho menos. Como Prem, el coreógrafo de la película, le ofrece a ella. Pero la pala

In [60]:
# Test sigmoid and gradient
print(sigmoid(0))
print(sigmoid(1))

tmp_x = np.append(np.ones((10, 1)), np.random.randn(10, 2), axis=1)
tmp_x
tmp_y = (np.random.randn(10, 1) > 0.35).astype(float)
tmp_y

tmp_J, tmp_theta = gradient_descent(tmp_x, tmp_y, np.zeros((3, 1)), 0.01, 1000)
print("Cost:", tmp_J[-1]) 
print("Theta:", tmp_theta)

0.5
0.7310585786300049
Cost: [[0.58333801]]
Theta: [[-0.63180767]
 [-0.15851034]
 [ 0.49948201]]
