# Data engineering project: Tweet similarity
### by Corentin DRAULT and Julien BONNET

### Importing libraries

In [3]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import numpy as np
import nltk, string
from sklearn.feature_extraction.text import TfidfVectorizer

### Creating main functions

In [4]:
# Stem every token with the module-level stemmer
def stem_tokens(tokens):
    """Return a new list with the stemmed form of each item in ``tokens``.

    The stemming itself is delegated to ``stemmer.stem``; ``stemmer`` is a
    module-level object that must be defined before this function is called
    with a non-empty input. Order of the tokens is preserved.
    """
    stems = []
    for token in tokens:
        stems.append(stemmer.stem(token))
    return stems

# Text preprocessing: lower-case, drop punctuation, tokenize, stem
def normalize(text):
    """Turn a raw string into a list of stemmed tokens.

    Steps, in order:
      1. lower-case ``text``;
      2. translate it through ``remove_punctuation_map`` (module-level table);
      3. split it into tokens with ``nltk.word_tokenize``;
      4. stem every token via ``stem_tokens``.
    """
    lowered = text.lower()
    without_punctuation = lowered.translate(remove_punctuation_map)
    tokens = nltk.word_tokenize(without_punctuation)
    return stem_tokens(tokens)


# Using cosine similarity to output the similarity between 2 texts
def cosine_similarity(row):
    """Return the similarity score between ``row.input`` and ``row.text``.

    Parameters
    ----------
    row : object exposing ``input`` and ``text`` attributes
        Typically one dataframe row passed by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    The off-diagonal entry ``[0, 1]`` of the 2x2 matrix obtained by
    multiplying the fitted TF-IDF matrix by its transpose.

    Notes
    -----
    The module-level ``vectorizer`` is re-fitted on every call, using only the
    two texts of the current row.
    NOTE(review): the product equals the cosine similarity only if the
    vectorizer L2-normalises its rows (presumably the TfidfVectorizer
    default) - confirm if the vectorizer settings change.
    """
    # Fit on just these two texts: one TF-IDF weighted row per text
    tfidf = vectorizer.fit_transform([row.input, row.text])

    # `@` is always the matrix product, and `.toarray()` is the documented way
    # to densify a sparse result (instead of the `.A` shorthand).
    pairwise = (tfidf @ tfidf.T).toarray()

    # The matrix is symmetric; [0, 1] is the input-versus-text score
    return pairwise[0, 1]

# The function returns element [0, 1] of the product matrix: the two input texts
# produce a 2x2 symmetric matrix whose off-diagonal entry is the similarity between them.

In [8]:
def sorting_tweet(input_user, df, top_n=20, similarity_fn=None):
    """Return the texts of the tweets most similar to ``input_user``.

    Parameters
    ----------
    input_user : str
        Text to compare every tweet against.
    df : pandas.DataFrame
        Frame holding the tweets in a ``text`` column. It is NOT modified:
        the scoring columns are added to a copy.
    top_n : int, default 20
        Maximum number of tweets to return. Fewer are returned when ``df``
        has fewer rows; a value <= 0 yields an empty list.
    similarity_fn : callable, optional
        Row-wise scoring function receiving a row with ``input`` and ``text``
        attributes. Defaults to the module-level ``cosine_similarity``.

    Returns
    -------
    list
        Values of the ``text`` column, ordered by decreasing similarity.
    """
    if similarity_fn is None:
        # Looked up at call time so the default scorer may be defined in a later cell
        similarity_fn = cosine_similarity

    # Nothing to rank (also avoids a row-wise apply on an empty frame)
    if top_n <= 0 or df.empty:
        return []

    # Work on a copy so the caller's dataframe keeps its original columns
    scored = df.assign(input=input_user)

    # Score every row against the user input
    scored['similarity'] = scored.apply(similarity_fn, axis=1)

    # Highest similarity first
    ranked = scored.sort_values(by='similarity', ascending=False)

    # head() never over-runs the frame, unlike a fixed range(0, 20) index loop
    return ranked['text'].head(top_n).tolist()

In [5]:
# Porter stemmer instance used by stem_tokens
stemmer = nltk.stem.porter.PorterStemmer()

# Translation table mapping each punctuation code point to None,
# so that str.translate deletes those characters
remove_punctuation_map = dict.fromkeys(map(ord, string.punctuation))

# TF-IDF vectorizer tokenizing with normalize() and using the English stop-word list
vectorizer = TfidfVectorizer(stop_words='english', tokenizer=normalize)

### Import dataset and prepare the dataframe

In [7]:
# Load the dataset (first CSV column becomes the index), drop the columns
# we do not need, and name the index "index"
df = (
    pd.read_csv('../tweets.csv', index_col=0)
    .drop(columns=['date', 'author', 'id', 'link', 'retweet'])
    .rename_axis("index")
)

# Display the dataframe: head(-5) selects every row except the last five
df.head(-5)

Unnamed: 0_level_0,text
index,Unnamed: 1_level_1
0,Here is my statement.pic.twitter.com/WAZiGoQqMQ
1,Is this really America? Terrible!pic.twitter.c...
2,The media and establishment want me out of the...
3,Certainly has been an interesting 24 hours!
4,Debate polls look great - thank you!\n#MAGA #A...
...,...
17206,"""When the achiever achieves, it's not a platea..."
17207,"Enter the ""Think Like A Champion"" signed book ..."
17208,"""Strive for wholeness and keep your sense of w..."
17209,Listen to an interview with Donald Trump discu...


### Applying our function to the whole dataset and getting the output

In [16]:
web_input = "Hello guys, I love you"

# sorting_tweet returns a list with the 20 most similar tweets
output = sorting_tweet(web_input, df)


# Same steps applied directly to the dataframe, for the presentation

# New column holding web_input on every row
df = df.assign(input=web_input)

# Score each row, order by decreasing similarity, then renumber the index
# so that it follows the sorted order
df = (
    df.assign(similarity=df.apply(cosine_similarity, axis=1))
    .sort_values(by='similarity', ascending=False)
    .reset_index(drop=True)
)

In [17]:
# Display the first 20 rows of the dataframe
df.head(20)

Unnamed: 0,text,input,similarity
0,"""@UmerRizwan2: @realDonaldTrump Can i get a he...","Hello guys, I love you",0.315266
1,"""@Candynecklace2: @realDonaldTrump I love love...","Hello guys, I love you",0.295268
2,"""@DharmaBum77: Donald Trump loves America! He ...","Hello guys, I love you",0.270733
3,.@IamStevenT stopped by my office to say hello...,"Hello guys, I love you",0.269518
4,@BrikMillerEDG You will love it!,"Hello guys, I love you",0.260556
5,@RealKyleMorris You will love it!,"Hello guys, I love you",0.260556
6,.@williebosshog such an honor to get your endo...,"Hello guys, I love you",0.237739
7,Paul Teutul Sr. is a fantastic guy. Although I...,"Hello guys, I love you",0.225555
8,Paul Teutul Sr. is a fantastic guy. Although I...,"Hello guys, I love you",0.225555
9,"""I love America. And when you love something y...","Hello guys, I love you",0.21289


### Print the top 20 most similar tweets

In [18]:
# Print every returned tweet followed by a blank line. Iterating over the
# list itself (instead of indexing a fixed range(0, 20)) avoids an IndexError
# when fewer than 20 tweets are available.
for tweet in output:
    print(tweet + '\n')

"@UmerRizwan2: @realDonaldTrump Can i get a hello for all the Canadians down in Ottawa, Ontario ?"  Hello Umer, you sound like a great guy!

"@Candynecklace2: @realDonaldTrump I love love love Mr. Trump he is my hero " Thanks.

"@DharmaBum77: Donald Trump loves America! He loves Americans! He loves our Vets! #TrumpSupporters @realDonaldTrumppic.twitter.com/NgcoHRLj2M

.@IamStevenT stopped by my office to say hello- a great guy! http://bit.ly/1n1KP1Z 

@BrikMillerEDG You will love it!

@RealKyleMorris  You will love it!

.@williebosshog such an honor to get your endorsement. You are a fantastic guy! It will not be forgotten. Don and Eric say hello!

Paul Teutul Sr. is a fantastic guy. Although I fired him on #CelebApprentice, we will remain great friends. I love the bike he made for me.

Paul Teutul Sr. is a fantastic guy. Although I fired him on #CelebApprentice, we will remain great friends. I love the bike he made for me.

"I love America. And when you love something you protect it p