# Tweet scraping

In [None]:
#!pip install pandas
#!pip install snscrape

In [None]:
import snscrape.modules.twitter as sntwitter
import pandas as pd

In [None]:
#technique 1: list of words in language opposite to lang
#TODO: it would be good if the search ran on the cleaned tweet
def give_me_tweets1(max_num, word_list, language):
    """Collect tweets tagged with ``language`` that contain any word in ``word_list``.

    The search query is the words joined with ``' OR '`` followed by
    ``' lang:<language>'``.

    Args:
        max_num: Upper bound on the number of tweets collected.
        word_list: Words to search for; a tweet matching any of them qualifies.
        language: Language code appended to the query as ``lang:<language>``.

    Returns:
        A DataFrame with the columns ``Datetime``, ``Language`` and ``Text``,
        one row per collected tweet.
    """
    query = ' OR '.join(word_list) + ' lang:' + language
    rows = []
    for position, item in enumerate(sntwitter.TwitterSearchScraper(query).get_items()):
        if position >= max_num:
            break
        # Other fields noted by the author: tweet.id, tweet.user.username
        rows.append([item.date, item.lang, item.content])

    return pd.DataFrame(rows, columns=['Datetime', 'Language', 'Text'])

In [None]:
# Example: up to 10 tweets tagged 'ca' that contain any of these words.
give_me_tweets1(10, ['oye', 'bueno', 'pues'], 'ca')

#ideas for ca: vamos, bueno, hombre, venga, oye
#ideas for es: adéu, conya

In [None]:
#technique 2: words from different languages in same tweet
def give_me_tweets2(max_num, word_list):
    """Print tweets that match every word in ``word_list``.

    The search query is the words joined with ``' AND '``; no language filter
    is applied.
    NOTE(review): confirm that Twitter search treats the literal ``AND`` as an
    operator rather than as a search term.

    Args:
        max_num: Upper bound on the number of tweets collected.
        word_list: Words that should all appear in a matching tweet.

    Returns:
        None. The collected tweets are printed as a DataFrame with the columns
        ``Datetime``, ``Language``, ``Text`` and ``URL``.
    """
    text = ' AND '.join(word_list)

    tweets_list = []
    for i, tweet in enumerate(sntwitter.TwitterSearchScraper(text).get_items()):
        # enumerate is zero-based, so stop at i == max_num; the previous
        # `i > max_num` check collected max_num + 1 tweets.
        if i >= max_num:
            break
        # Other fields noted by the author: tweet.id, tweet.user.username
        tweets_list.append([tweet.date, tweet.lang, tweet.content, tweet.url])

    tweets_df = pd.DataFrame(tweets_list, columns=['Datetime', 'Language', 'Text', 'URL'])
    print(tweets_df)

In [None]:
# Example: tweets that contain both 'adéu' and 'adiós'.
give_me_tweets2(10, ['adéu', 'adiós'])

# Language detection tests

## GCLD3

In [None]:
#identify catalan-spanish mixed tweets

!pip install gcld3

In [None]:
import gcld3

In [None]:
# Shared gcld3 identifier used by the cells below.
# NOTE(review): min_num_bytes / max_num_bytes presumably bound how much of the
# input is considered — confirm against the gcld3 documentation.
detector = gcld3.NNetLanguageIdentifier(min_num_bytes=0, max_num_bytes=1000)

In [None]:
# Ask gcld3 for the two most frequent languages in a Spanish-only sample.
sample = ("Buenos días, amigos. Cómo estáis? Estoy hablando español.")
results = detector.FindTopNMostFreqLangs(text=sample, num_langs=2)

In [None]:
# Print, for each returned candidate, its language code, reliability flag,
# proportion and probability.
for i in results:
    print(i.language, i.is_reliable, i.proportion, i.probability)

## Polyglot

In [None]:
#!pip install polyglot
#!pip install morfessor

In [None]:
import polyglot

from polyglot.text import Text, Word

In [None]:
# Single-language check: print the code and name polyglot assigns to the text.
text = Text("Bonjour, Mesdames.")
print("Language Detected: Code={}, Name={}\n".format(text.language.code, text.language.name))

In [None]:
from polyglot.detect import Detector

# Mixed English + Catalan sample: print every language polyglot's Detector reports.
mixed_text = "Hello! How are you? Bon dia! Com esteu?"

for language in Detector(mixed_text).languages:
        print(language)

## Langdetect

In [None]:
#!pip install langdetect

In [None]:
from langdetect import detect, detect_langs
# Run langdetect on a sentence that mixes Spanish and Catalan.
result=detect_langs("Hola, cómo estáis, amigos? Sois tontos. Adéu, amics i amigues.")

# Bare expression so the notebook displays the detected candidates.
result

In [None]:
def is_mixed(text):
    """Return True when langdetect reports both Spanish and Catalan for ``text``.

    Args:
        text: The string to analyse.

    Returns:
        True if both ``'es'`` and ``'ca'`` are among the languages returned by
        ``detect_langs``; False otherwise.

    Any exception raised by ``detect_langs`` propagates to the caller.
    NOTE(review): the callers in this notebook treat such an exception as
    "no language features" — confirm the exact exception type in langdetect.
    """
    # Detect once. langdetect is non-deterministic unless seeded, so checking
    # 'es' and 'ca' against two separate detections could disagree.
    detected = {candidate.lang for candidate in detect_langs(text)}
    return 'es' in detected and 'ca' in detected

In [None]:
# Sanity check on a sentence that mixes Spanish and Catalan.
is_mixed("Hola, cómo estáis, amigos? Sois tontos. Adéu, amics i amigues.")

## Fasttext

In [None]:
#!pip install fasttext

In [None]:
import fasttext
# Path of the model file, relative to the working directory.
# NOTE(review): presumably fastText's pre-trained language-identification
# model, which has to be downloaded separately — confirm.
PRETRAINED_MODEL_PATH = './lid.176.bin'
model = fasttext.load_model(PRETRAINED_MODEL_PATH)

In [None]:
# One input string that mixes English and Catalan sentences.
sentences = ['War does not show who is right, just who is left. Hola, em dic Berta i soc de Barcelona. He estudiat filosofia i he fet un doctorat de merda.']
predictions = model.predict(sentences)
# Bare expression so the notebook displays the prediction.
predictions

## Spacy

## Langid

## Pycld2

In [None]:
!pip install pycld2

In [None]:
import pycld2 as cld2

In [None]:
# NOTE(review): despite the variable name, the sample below is English
# followed by Catalan, not French + English.
fr_en_Latn = """\
France is the largest country in Western Europe. Em dic Armand, vaig nèixer a Barcelona."""

# returnVectors=True makes cld2.detect return a fourth element, `vectors`.
isReliable, textBytesFound, details, vectors = cld2.detect(
    fr_en_Latn, returnVectors=True
)

print(vectors)
print(details)
print(textBytesFound)
print(isReliable)

#### Conclusion thus far: Langdetect is the best tool I've tried.

In [None]:
#I use Langdetect to filter tweets found near Barcelona

# Look at the first 100 search results within 15 miles of BCN and keep those
# that is_mixed flags as containing both Spanish and Catalan.
tweets_list=[]
for i,tweet in enumerate(sntwitter.TwitterSearchScraper("near:BCN within:15mi").get_items()):
    if i>=100:
        break
    if not tweet.content:
        continue
    # One tweet that the detector cannot handle must not abort the whole
    # scrape; report it and move on, as the later cells in this notebook do.
    try:
        mixed = is_mixed(tweet.content)
    except Exception:
        print('No language features:', tweet.content)
        continue
    if mixed:
        # Other fields noted by the author: tweet.id, tweet.user.username
        tweets_list.append([tweet.date, tweet.content])

tweets_df = pd.DataFrame(tweets_list, columns=['Datetime', 'Text'])

In [None]:
# Lift the column-width limit so the tweet text is not truncated when printed.
pd.set_option('display.max_colwidth', None)
print(tweets_df['Text'])

### I'm gonna try to clean the tweets first

In [None]:
#!pip install twitter-text-python

In [None]:
from ttp import ttp

# Demo: strip @mentions and URLs from one sample tweet with twitter-text-python.
tweet="@burnettedmond, you now support #IvoWertzel's tweet parser! https://github.com/edmondburnett/"

p = ttp.Parser()
result = p.parse(tweet)
#print(result.reply)
#print(result.users)
#print(result.tags)
#print(result.urls)

# Every removal is applied to clean_tweet, so earlier removals are kept.
# (The previous loop restarted from `tweet` on each iteration and so kept
# only the last mention's removal.)
clean_tweet=tweet
# NOTE(review): result.reply is presumably None when the tweet does not open
# with a mention — the guard avoids passing None to str.replace.
if result.reply:
    clean_tweet=clean_tweet.replace('@'+result.reply, '')
for item in result.users:
    clean_tweet=clean_tweet.replace('@'+item, '')
#for item in result.tags:
#    clean_tweet=clean_tweet.replace(item, '')
for item in result.urls:
    clean_tweet=clean_tweet.replace(item, '')

# Bare expression so the notebook displays the cleaned text.
clean_tweet

In [None]:
#!pip install emoji
import re

In [None]:
# Compiled once. The earlier character class contained the range
# U+24C2-U+1F251, which spans almost the whole Basic Multilingual Plane above
# U+24C2 and therefore also deleted CJK, Hangul, kana and other ordinary text.
# The class below keeps only symbol ranges and single code points.
_EMOJI_PATTERN = re.compile(
    "["
    "\U00010000-\U0010FFFF"  # all supplementary planes (emoticons, pictographs, transport, flags, ...)
    "\u2600-\u2B55"  # misc symbols, dingbats, arrows (includes U+2640-2642 and U+2702-27B0)
    "\u25A0-\u25FF"  # geometric shapes (play/reverse buttons, squares)
    "\u24C2"  # circled M
    "\u231a"  # watch
    "\u23cf"  # eject symbol
    "\u23e9"  # fast-forward
    "\u3030"  # wavy dash
    "\u303d"  # part alternation mark
    "\u3297"  # circled ideograph congratulation
    "\u3299"  # circled ideograph secret
    "\u200d"  # zero-width joiner
    "\ufe0f"  # variation selector-16
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return ``text`` with emoji and emoji-related code points removed.

    Each run of matching characters is deleted, not replaced by a space, so
    the surrounding whitespace is left as it was.

    Args:
        text: The string to clean.

    Returns:
        A new string without the characters matched by ``_EMOJI_PATTERN``.
    """
    return _EMOJI_PATTERN.sub('', text)

In [None]:
def quick_clean(tweet_text):
    """Strip @mentions, URLs and emoji from a tweet's text.

    Mentions and URLs are the ones the ttp parser finds in ``tweet_text``;
    emoji are removed with ``remove_emoji``.

    Args:
        tweet_text: Raw tweet text.

    Returns:
        The text with those elements deleted. Surrounding whitespace and
        punctuation are left as they were.
    """
    p = ttp.Parser()
    result = p.parse(tweet_text)
    # Replace the longest handles first: removing '@ann' before '@anna' would
    # otherwise leave a stray 'a' behind.
    for item in sorted(result.users, key=len, reverse=True):
        tweet_text=tweet_text.replace('@'+item, '')
    # Same ordering for URLs, in case one is a prefix of another.
    for item in sorted(result.urls, key=len, reverse=True):
        tweet_text=tweet_text.replace(item, '')
    tweet_text=remove_emoji(tweet_text)
    return tweet_text

### Combine both functionalities:

In [None]:
# Look at the first 1000 search results within 15 miles of BCN, clean each
# tweet with quick_clean, and keep those longer than 10 characters that
# is_mixed flags as containing both Spanish and Catalan.
tweets_list=[]
for i,tweet in enumerate(sntwitter.TwitterSearchScraper("near:BCN within:15mi").get_items()):
    if i>=1000:
        break
    tweet_text=quick_clean(tweet.content)
    #print(i, tweet_text)
    if len(tweet_text)<=10:
        continue
    # Only the detection call is guarded. `except Exception` replaces the bare
    # `except:`, which also swallowed KeyboardInterrupt and SystemExit.
    try:
        mixed = is_mixed(tweet_text)
    except Exception:
        print('No language features:', tweet_text)
        continue
    if mixed:
        # Other fields noted by the author: tweet.id, tweet.user.username
        tweets_list.append([tweet.date, tweet_text])

tweets_df = pd.DataFrame(tweets_list, columns=['Datetime', 'Text'])

In [None]:
# Lift the column-width limit so the tweet text is not truncated on display.
pd.set_option('display.max_colwidth', None)
tweets_df

In [None]:
#tweets by juana dolores (or other users who tend to code-mix)

# Look at the first 1000 results for the account, clean each tweet with
# quick_clean, and keep those longer than 10 characters that is_mixed flags
# as containing both Spanish and Catalan.
tweets_list=[]
for i,tweet in enumerate(sntwitter.TwitterSearchScraper("from:juanadolorex").get_items()):
    if i>=1000:
        break
    tweet_text=quick_clean(tweet.content)
    #print(i, tweet_text)
    if len(tweet_text)<=10:
        continue
    # Only the detection call is guarded. `except Exception` replaces the bare
    # `except:`, which also swallowed KeyboardInterrupt and SystemExit.
    try:
        mixed = is_mixed(tweet_text)
    except Exception:
        print('No language features:', tweet_text)
        continue
    if mixed:
        # Other fields noted by the author: tweet.id, tweet.user.username
        tweets_list.append([tweet.date, tweet_text])

tweets_df = pd.DataFrame(tweets_list, columns=['Datetime', 'Text'])

In [None]:
# Bare expression so the notebook displays the filtered tweets.
tweets_df

## Tweets with certain hashtags


In [None]:
give_me_tweets1(10, ['#lol'], 'ca') #others: #ironia, #no. Especially if they are at the end of the tweet.