# Get English Posts

This notebook aims to extract posts in English language for particular time period.

It uses https://huggingface.co/papluca/xlm-roberta-base-language-detection in oder to determine the language.

In [None]:
DATA_PATH = os.environ['STORAGE'] + 'instagram/'
CAPTIONS_PATH = f"{DATA_PATH}/captions/raw/"
FILES = ["nyc_posts_2019.csv", "nyc_posts_2018.csv", "nyc_posts_2020.csv"]


In [14]:
import datetime
import time

In [7]:
import pandas as pd

In [13]:
df_nyc = pd.read_csv(CAPTIONS_PATH + FILES[0], sep=',')

Data slicing process. Timestamp column is shifted to +8 hours.

In [None]:
# '8 июня, 10-11 часов, 16-17 часов, 22-23 часа', '12 октября, также 10-11 часов, 16-17 часов, 22-23 часа', '4 февраля 11-13 и 22-23  часов'

In [291]:
year = 2019
month = 2
day = 4

In [292]:
hour_st = 11
hour_en = 11
date_time = datetime.datetime(year, month, day, hour_st+8, 0)
print(date_time)
start = time.mktime(date_time.timetuple())
date_time = datetime.datetime(year, month, day, hour_en+8, 30)
print(date_time)
end = time.mktime(date_time.timetuple())
data1 = df_nyc[(end > df_nyc.timestamp) &  ( df_nyc.timestamp > start)][['code', 'caption']]
data1.shape

2019-02-04 03:00:00
2019-02-04 03:30:00


(544, 2)

In [293]:
# 16-17 часов это 00-01  следующего дня
hour_st = 16
hour_en = 16
date_time = datetime.datetime(year, month, day, hour_st-8, 0)
print(date_time)
start = time.mktime(date_time.timetuple())
date_time = datetime.datetime(year, month, day, hour_en-8, 30)
print(date_time)
end = time.mktime(date_time.timetuple())
data2 = df_nyc[(end > df_nyc.timestamp) &  ( df_nyc.timestamp > start)][['code', 'caption']]
data2.shape

2019-02-04 08:00:00
2019-02-04 08:30:00


(78, 2)

In [294]:
hour_st = ( 22 + 8 ) % 24
hour_en = ( 22 + 8 ) % 24
#hour_st = 22
#hour_en = 22
date_time = datetime.datetime(year, month, day, hour_st, 0)
print(date_time)
start = time.mktime(date_time.timetuple())
date_time = datetime.datetime(year, month, day, hour_en, 30)
print(date_time)
end = time.mktime(date_time.timetuple())
data3 = df_nyc[(end > df_nyc.timestamp) &  ( df_nyc.timestamp > start)][['code', 'caption']]
data3.shape

2019-02-04 14:00:00
2019-02-04 14:30:00


(569, 2)

In [295]:
new_data = pd.concat([data1, data2, data3])

In [296]:
new_data = new_data.dropna(subset=['caption'])
new_data.shape

(1118, 2)

Data is ready, applying model to determine language

In [273]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

In [159]:
tokenizer = AutoTokenizer.from_pretrained("papluca/xlm-roberta-base-language-detection")
model = AutoModelForSequenceClassification.from_pretrained("papluca/xlm-roberta-base-language-detection")

In [160]:
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

In [178]:
import re

Threshold is set to 0.1, if the confidence of a model is lower, we set 'unk' as a mark.

In [207]:
def detect_modified(x):
    try:
        x = сlean_tags(x)
        if classifier(x)[0]['score'] < 0.1:
            return 'en'
        return classifier(x)[0]['label']
    except:
        return 'unk'

In [208]:
# deleting hashtags and mentions, uses only Latin letters (in order to delete emoji and other non-alphabetic symbols
# probably should be fixed to more clever way) and digits
def сlean_tags(x):
    x = re.sub(r'#[\S]*\s', ' ', x+' ')
    x = re.sub(r'@[\S]*\s', ' ', x)
    x = re.sub(r'[^A-Za-z0-9’]+', ' ', x)
    return x

Some weird examples found by testing

In [209]:
classifier('Happy Birthday Little Girl! 🎠'), classifier('Happy Birthday Little Girl!')

([{'label': 'hi', 'score': 0.39989757537841797}],
 [{'label': 'en', 'score': 0.7795630693435669}])

In [210]:
classifier('Cheers y’all'), classifier('Monday mood with these Capoeira beauties Outfit')

([{'label': 'es', 'score': 0.8983380794525146}],
 [{'label': 'pt', 'score': 0.9497151374816895}])

In [297]:
new_data['lang'] = new_data['caption'].apply(detect_modified)

In [298]:
new_data['no_tags'] = new_data['caption'].apply(сlean_tags)

In [299]:
new_data[new_data.lang != 'en'].shape, new_data[new_data.lang == 'en'].shape

((164, 4), (954, 4))

Examples of found non-english text to check quality by quick observation

In [300]:
new_data[new_data.lang != 'en'].head(50)[['lang', 'no_tags', 'caption']]

Unnamed: 0,lang,no_tags,caption
5743274,es,Los mejores momentos,Los mejores momentos
5743407,pt,Segunda parada junto a mi colega at dale ya,Segunda parada junto a mi colega @djcholo_elgr...
5743528,es,Nuevo ciclo 2019 en Aqui es donde nacen las pe...,Nuevo ciclo 2019 en @latintalentacademy 🏅🤸‍♀️ ...
5744859,nl,JCJ Vanderheyden Opening,JCJ Vanderheyden Opening @boersli_gallery #abo...
5745044,ur,PATRIOTS BABY,PATRIOTS BABY !!!🏈🏆🏈🏆🏈
5745101,fr,Saint Joseph Roman Catholic,Saint Joseph\nRoman Catholic
5745268,hi,party shmarty,party shmarty 🎉🍾
5745602,it,A mi me fascinan,A mi me fascinan 🥰
5745604,es,Moda color autenticidad y comodidad,"Moda, color, autenticidad y comodidad"
5745605,es,Me encantan comodidad moda cl sico talla 35 ha...,"Me encantan comodidad, moda, clásico.... talla..."


In [301]:
final_data = new_data[new_data.lang == 'en'][['code', 'caption']]

In [302]:
final_data.to_csv(f'nyc_{day}_{month}_2019_30min.csv')

As a result, mostly filtered data is non-English indeed, there are some exeptions but they do not affect a big role in event detection tasks. Thus, it was decided to not take them into account.