In [None]:
from core import *

In [1]:
import pandas as pd
import re
import string
import random
import numpy as np

import emoji
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk import pos_tag
from nltk import FreqDist
from nltk.tokenize import sent_tokenize, word_tokenize

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Translation table that deletes every ASCII punctuation character. It never
# changes between calls, so it is built once instead of once per post.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def _english_stop_words() -> frozenset:
    '''Return NLTK's English stopword list, reading it only on the first call.'''
    cached = getattr(_english_stop_words, '_cache', None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stop_words._cache = cached
    return cached


def preprocess_post(text: str, lem: bool = True) -> str:
    '''
    Clean and normalize a post so it can be fed to text-analysis steps.

    input:
    - text (str): input text to be processed. Any other object is converted
      with str(); None and float NaN are treated as "no text".
    - lem (bool): if True, apply lemmatization. if False, will not apply.

    steps :
    1. convert the text to lowercase to standardize case.
    2. remove urls, user mentions (@) and hashtags (#).
    3. strip html tags, then decode html entities (e.g. "&gt;", "&amp;").
    4. remove emojis from the text.
    5. remove ASCII punctuation marks.
    6. eliminate extra whitespace.
    7. tokenize the text into individual words.
    8. remove English stopwords to focus on meaningful words.
    9. optionally lemmatize each word to reduce it to its base form
       (no stemming is applied).

    output :
    - str: cleaned and normalized version of the input text, with tokens
      joined by single spaces. Empty string for missing input.
    '''
    # Local import so this cell does not depend on the notebook's import cell.
    import html

    # A missing value must not become the literal token "none" / "nan".
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text)

    # 1.
    text = text.lower()

    # 2.
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'@\w+|#\w+', '', text)

    # 3. Tags are removed first; entities are decoded afterwards so that an
    # escaped "&lt; ... &gt;" pair in the text is not mistaken for a real tag.
    text = re.sub(r'<.*?>', '', text)
    text = html.unescape(text)

    # 4.
    text = emoji.replace_emoji(text, replace='')

    # 5.
    text = text.translate(_PUNCTUATION_TABLE)

    # 6.
    text = re.sub(r'\s+', ' ', text).strip()

    # 7.
    words = word_tokenize(text)

    # 8.
    stop_words = _english_stop_words()
    words = [word for word in words if word not in stop_words]

    # 9.
    # A PorterStemmer could be used instead; the original author judged it
    # less accurate for sentiment analysis, so only lemmatization is offered.
    if lem:
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]

    # Rebuild the text from the remaining tokens.
    return ' '.join(words)


In [4]:
# File name of the semicolon-separated Reddit dump (a relative path, so it is
# looked up in the notebook's working directory).
path = 'reddit_database_sentiment.csv'

# Parser settings collected in one mapping so they can be read at a glance.
_csv_options = {
    'delimiter': ';',        # fields are separated by semicolons
    'quotechar': '"',        # fields that contain special characters are wrapped in "
    'header': 0,             # the first row holds the column names
    'encoding': 'utf-8',
    'on_bad_lines': 'skip',  # drop rows that are wrongly formatted instead of failing
    'low_memory': False,     # parse in a single pass rather than in internal chunks
}
df = pd.read_csv(path, **_csv_options)

FileNotFoundError: [Errno 2] No such file or directory: 'reddit_database_sentiment.csv'

In [9]:
# Pick the posts labelled 0 and 1 as sample inputs for the cells below,
# and print both so the reader can see what is being analysed.
sample_posts = df['post']
post_prueba, post_prueba2 = sample_posts[0], sample_posts[1]
print(post_prueba, post_prueba2, sep='\n')

There's a lot of reasons to want to know all this stuff, so I figured I'd get to know the others that are on this subreddit.

So let's hear it: Webmasters? Coders? Marketers? Work for an analytics software company? You get the idea.
I'm cross posting this from /r/cyberlaw, hopefully you guys find it as interesting as I did(it deals with Google Analytics):

So quite awhile ago, I ordered a Papa John's pizza online. My job largely involves looking at ads that appear online, so afterwards I was quick to notice *I was getting a LOT* of Papa Johns ads (especially at night) being served through a Google owned company (DoubleClick media). Yesterday one of these ads popped up again on Youtube (a place that typically serves using the adwords program, not doubleclick), so I decided to copy the URL. 

For those not in the advertising field: Making full use of Google's analytics tool means that certain information about the advertising campaign is leaked in the URL.

So let's break it apart: 

&gt

In [10]:
# Test of module 4: run sentiment_analysis on the first sample post.
# NOTE(review): sentiment_analysis is not defined in this notebook; presumably
# it is provided by the `from core import *` cell — confirm.
sentiment_analysis(post_prueba)

(0.125, 0.0, 6.875)
NEGATIVE
POSITIVE


In [11]:
# Test of module 5: run post_summarisation on the first sample post.
# NOTE(review): post_summarisation is not defined in this notebook; presumably
# it is provided by the `from core import *` cell — confirm.
post_summarisation(post_prueba)

("There's a lot of reasons to want to know all this stuff, so I figured I'd get to know the others that are on this subreddit.",
 array([[1.        , 0.75087989],
        [0.75087989, 1.        ]]))

In [12]:
# Test of module 6: run texts_distance on the two sample posts.
# NOTE(review): texts_distance is not defined in this notebook; presumably
# it is provided by the `from core import *` cell — confirm.
texts_distance(post_prueba, post_prueba2)

('La distancia entre los dos textos indicados es: ', 0.7346749756775888)