In [32]:
# Installing dependencies
#!pip -q install transformers torch accelerate tqdm

# Importing dependencies
import re
import math
import torch
from tqdm.auto import tqdm
from transformers import pipeline
import pandas as pd
import numpy as np

In [33]:
# Reading the sample data
data_sample = pd.read_csv('../data/raw/sample/twcs_sample.csv')

## Creating target variable

In [34]:
# Converting tweet_id and in_response_to_tweet_id to Int64
data_sample['tweet_id'] = pd.to_numeric(
    data_sample['tweet_id'], errors='coerce'
).astype('Int64')

data_sample['in_response_to_tweet_id'] = pd.to_numeric(
    data_sample['in_response_to_tweet_id'], errors='coerce'
).astype('Int64')

# Creating a dictionary to map the relation between tweet_id and in_response_to_tweet_id
parent_of = dict(
    data_sample[['tweet_id', 'in_response_to_tweet_id']]
    .dropna().to_numpy()
)


# Root of conversation = last message of the conversation
root_cache = {}

def get_root(tid, max_steps=50):
    x = tid
    path = []
    steps = 0
    while True:
        if x in root_cache:
            root = root_cache[x]
            break
        parent = parent_of.get(x, np.nan)
        if pd.isna(parent) or parent not in parent_of:
            root = x
            break
        if steps >= max_steps:  # proteção a ciclos/threads longas
            root = x
            break
        path.append(x)
        x = parent
        steps += 1
    for p in path:
        root_cache[p] = root
    return root

tqdm.pandas(desc='Computando conversation_id')
data_sample['conversation_id'] = data_sample['tweet_id'].progress_apply(
    get_root
).astype('Int64')

Computando conversation_id: 100%|██████████| 100000/100000 [00:00<00:00, 508838.39it/s]


In [35]:
# Validation of the conversation_id column

tam = (data_sample.groupby('conversation_id')['tweet_id']
       .size().sort_values(ascending=False))
print('Big Threads:', tam.head(10).to_dict())

ex_conv = tam.index[0]
print('\nExample:')
cols = ['conversation_id', 'tweet_id', 'inbound', 'author_id',
        'in_response_to_tweet_id', 'created_at']
display(data_sample.loc[
    data_sample['conversation_id'] == ex_conv, cols
].sort_values('created_at').head(10))

Big Threads: {np.int64(1362465): 3, np.int64(604512): 3, np.int64(732556): 3, np.int64(1226215): 3, np.int64(1108746): 3, np.int64(525592): 3, np.int64(1053383): 3, np.int64(1034144): 3, np.int64(43407): 3, np.int64(858798): 3}

Example:


Unnamed: 0,conversation_id,tweet_id,inbound,author_id,in_response_to_tweet_id,created_at
66886,1362465,1362465,True,119972,1362466,Fri Oct 27 20:35:38 +0000 2017
90491,1362465,1362463,False,AmazonHelp,1362465,Fri Oct 27 20:38:00 +0000 2017
76581,1362465,1362464,True,119972,1362463,Fri Oct 27 20:40:00 +0000 2017


In [36]:
# Feature Engineering de Tempo de Atendimento
data_sample['created_at'] = pd.to_datetime(data_sample['created_at'], errors='coerce')

# Garante ordenação por tempo
data_sample_sorted = data_sample.sort_values(['conversation_id', 'created_at'])

# Duração total da conversa
conv_aggs = data_sample_sorted.groupby('conversation_id')['created_at'].agg(['min', 'max'])
conv_aggs['total_duration_minutes'] = (
    (conv_aggs['max'] - conv_aggs['min']).dt.total_seconds() / 60.0
)

# Tempo para primeira resposta
first_inbound = (
    data_sample_sorted[data_sample_sorted['inbound']]
    .groupby('conversation_id')['created_at'].first()
)
first_outbound = (
    data_sample_sorted[~data_sample_sorted['inbound']]
    .groupby('conversation_id')['created_at'].first()
)

# Junta as features de tempo
conv_features = pd.DataFrame(conv_aggs[['total_duration_minutes']])
conv_features = conv_features.merge(
    first_inbound.rename('first_inbound'),
    left_index=True, right_index=True, how='left'
)
conv_features = conv_features.merge(
    first_outbound.rename('first_outbound'),
    left_index=True, right_index=True, how='left'
)

# Calcula o tempo de resposta
conv_features['time_to_first_response_minutes'] = (
    (conv_features['first_outbound'] - conv_features['first_inbound'])
    .dt.total_seconds() / 60.0
)

# Remove valores negativos (possivelmente por dados incorretos)
conv_features.loc[conv_features['time_to_first_response_minutes'] < 0,
                  'time_to_first_response_minutes'] = np.nan

# Seleciona colunas finais e exibe estatísticas
conv_features = conv_features[
    ['total_duration_minutes', 'time_to_first_response_minutes']
]

print("Novas features de tempo (resumo):")
display(conv_features.describe())
print("\\nPrimeiras linhas das novas features:")
display(conv_features.head())


  data_sample['created_at'] = pd.to_datetime(data_sample['created_at'], errors='coerce')


Novas features de tempo (resumo):


Unnamed: 0,total_duration_minutes,time_to_first_response_minutes
count,98622.0,591.0
mean,16.62273,204.999351
std,3565.895,737.251264
min,0.0,0.516667
25%,0.0,4.925
50%,0.0,16.2
75%,0.0,76.558333
max,1114589.0,9958.483333


\nPrimeiras linhas das novas features:


Unnamed: 0_level_0,total_duration_minutes,time_to_first_response_minutes
conversation_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.0,
12,0.0,
69,0.0,
84,0.0,
93,0.0,


In [38]:
# Normalize datetime
data_sample['created_at'] = pd.to_datetime(
    data_sample['created_at'], utc=True, errors='coerce'
)

# Detect text column
text_col = ['text']
# Detect conversation column
conv_col = 'conversation_id'

if conv_col not in data_sample.columns:
    raise ValueError('No conversation_id in data_sample.')


text_col = ['text']

try:
    text_col
except NameError:
    text_col = None

if isinstance(text_col, (list, tuple, np.ndarray, pd.Index)):
    text_col = next(
        (c for c in text_col if c in data_sample.columns),
        None
    )

if not isinstance(text_col, str) or text_col not in data_sample.columns:
    text_col = next(
        (c for c in text_col if c in data_sample.columns),
        None
    )

if text_col is None:
    raise ValueError('No text column in data_sample.')

print('Using columns:',
      {'conv_col': conv_col, 'text_col': text_col})

Using columns: {'conv_col': 'conversation_id', 'text_col': 'text'}


In [39]:
# Using the last tweet of the conversation
data_sample['created_at'] = pd.to_datetime(
    data_sample['created_at'], utc=True, errors='coerce'
)

df_cli = data_sample.loc[
    data_sample['inbound'] == True,
    [conv_col, 'tweet_id', 'created_at', text_col]
].copy()

df_cli = df_cli.sort_values([conv_col, 'created_at'])
idx_last = df_cli.groupby(conv_col)['created_at'].idxmax()
ultimo_cliente = df_cli.loc[idx_last].reset_index(drop=True)

ultimo_cliente['text_clean'] = (
    ultimo_cliente[text_col].astype(str)
    .str.replace(r'\s+', ' ', regex=True).str.strip()
)

ultimo_cliente = ultimo_cliente.loc[
    ultimo_cliente['text_clean'].str.len() > 0
].reset_index(drop=True)

ultimo_cliente.head(3)

Unnamed: 0,conversation_id,tweet_id,created_at,text,text_clean
0,12,12,2017-10-31 22:04:47+00:00,@sprintcare You gonna magically change your co...,@sprintcare You gonna magically change your co...
1,69,69,2017-10-31 22:03:32+00:00,@ChipotleTweets messed up today and didn’t giv...,@ChipotleTweets messed up today and didn’t giv...
2,84,84,2017-11-01 01:23:03+00:00,@ChipotleTweets are they supposed to charge pp...,@ChipotleTweets are they supposed to charge pp...


In [40]:
# Merge com as features de tempo
ultimo_cliente = ultimo_cliente.merge(
    conv_features,
    left_on='conversation_id',
    right_index=True,
    how='left'
)

print("Colunas após o merge:")
print(ultimo_cliente.columns)
display(ultimo_cliente[['total_duration_minutes', 'time_to_first_response_minutes']].describe())


Colunas após o merge:
Index(['conversation_id', 'tweet_id', 'created_at', 'text', 'text_clean',
       'total_duration_minutes', 'time_to_first_response_minutes'],
      dtype='object')


Unnamed: 0,total_duration_minutes,time_to_first_response_minutes
count,54598.0,591.0
mean,30.01029,204.999351
std,4792.531,737.251264
min,0.0,0.516667
25%,0.0,4.925
50%,0.0,16.2
75%,0.0,76.558333
max,1114589.0,9958.483333


In [41]:

# Loading sentiment analysis pipeline
device = 0 if torch.cuda.is_available() else -1
sentiment = pipeline(
    task='sentiment-analysis',
    model='nlptown/bert-base-multilingual-uncased-sentiment',
    device=device
)

Device set to use cpu


In [43]:
# Inferring sentiment in batches
def batched_idx(index, batch_size=32):
    arr = index.to_list()
    for i in range(0, len(arr), batch_size):
        yield arr[i:i+batch_size]

preds = []
indices = ultimo_cliente.index
for chunk in tqdm(batched_idx(indices, 32), total=math.ceil(len(indices)/32)):
    texts = ultimo_cliente.loc[chunk, 'text_clean'].tolist()
    out = sentiment(texts, truncation=True)
    preds.extend(out)

pred_df = pd.DataFrame(preds, index=indices)
ultimo_cliente['sent_label'] = pred_df['label']
ultimo_cliente['sent_score'] = pred_df['score']

# Extrai dígito de 'X star(s)'
ultimo_cliente['sent_stars'] = (
    ultimo_cliente['sent_label'].str.extract(r'(\d)').astype(int)
)

ultimo_cliente[['tweet_id', 'sent_label', 'sent_stars']].head(5)

100%|██████████| 1707/1707 [58:39<00:00,  2.06s/it] 


Unnamed: 0,tweet_id,sent_label,sent_stars
0,12,5 stars,5
1,69,1 star,1
2,84,1 star,1
3,93,3 stars,3
4,149,1 star,1


In [44]:
# Creating target variables (Binary (0 (Unsatisfied), 1 (Satisfied), NaN (Neutral)), Multiclass (Insatisfied, Neutral, Satisfied))
def map_bin(stars):
    if stars <= 2:
        return 0
    if stars >= 4:
        return 1
    return np.nan

ultimo_cliente['target_bin'] = ultimo_cliente['sent_stars'].apply(map_bin)

ultimo_cliente['target_multi'] = ultimo_cliente['sent_stars'].map({
    1: 'Unsatisfied',
    2: 'Unsatisfied',
    3: 'Neutral',
    4: 'Satisfied',
    5: 'Satisfied'
})

# Conjunto para treino binário (sem neutros)
train_bin = ultimo_cliente.dropna(subset=['target_bin']).copy()
train_bin['target_bin'] = train_bin['target_bin'].astype(int)

print('Starts distribution:')
print(ultimo_cliente['sent_stars'].value_counts().sort_index())

print('\nTarget_bin distribution (0/1, sem 3):')
print(train_bin['target_bin'].value_counts())

Starts distribution:
sent_stars
1    36132
2     1926
3     3968
4     1744
5    10828
Name: count, dtype: int64

Target_bin distribution (0/1, sem 3):
target_bin
0    38058
1    12572
Name: count, dtype: int64


In [45]:
# Atualizando o bin_train com as novas features
# O `ultimo_cliente` já possui as colunas de tempo
# Recriamos `bin_train` para garantir que ele também as tenha.
bin_train = ultimo_cliente.dropna(subset=['target_bin']).copy()
bin_train['target_bin'] = bin_train['target_bin'].astype(int)

print("Verificação das colunas em `bin_train`:")
print(bin_train.info())


Verificação das colunas em `bin_train`:
<class 'pandas.core.frame.DataFrame'>
Index: 50630 entries, 0 to 54597
Data columns (total 12 columns):
 #   Column                          Non-Null Count  Dtype              
---  ------                          --------------  -----              
 0   conversation_id                 50630 non-null  Int64              
 1   tweet_id                        50630 non-null  Int64              
 2   created_at                      50630 non-null  datetime64[ns, UTC]
 3   text                            50630 non-null  object             
 4   text_clean                      50630 non-null  object             
 5   total_duration_minutes          50630 non-null  float64            
 6   time_to_first_response_minutes  539 non-null    float64            
 7   sent_label                      50630 non-null  object             
 8   sent_score                      50630 non-null  float64            
 9   sent_stars                      50630 non-null  

In [46]:
def clean_text(text):
    text = re.sub(r"http\S+", "", text)  # URLs
    text = re.sub(r"@\w+", "", text)     # mentions
    text = re.sub(r"#\w+", "", text)     # hashtags
    text = re.sub(r"\s+", " ", text)     # extra spaces
    return text.strip()

ultimo_cliente['text_clean'] = ultimo_cliente['text_clean'].apply(clean_text)
bin_train['text_clean'] = bin_train['text_clean'].apply(clean_text)

In [47]:
#Saving results
import fastparquet



ultimo_cliente.to_parquet('../data/preprocessed/client_tweets.parquet', index=False, engine='fastparquet')
train_bin.to_parquet('../data/preprocessed/train_bin.parquet', index=False, engine='fastparquet')
stats = {
    "n_observations_original": len(data_sample),
    "n_observations_final": len(ultimo_cliente),
    "distribution_sent_stars": ultimo_cliente['sent_stars'].value_counts().to_dict()
}

import json
with open("../data/preprocessed/eda_stats.json", "w") as f:
    json.dump(stats, f, indent=2)


## Combinação de Features e Preparação Final


In [48]:
# Installing dependencies
%pip install -q sentence-transformers
%pip install -q scikit-learn

# Importing dependencies
import joblib
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sentence_transformers import SentenceTransformer
import os

# Ensuring output directories exist
os.makedirs('../models', exist_ok=True)
os.makedirs('../data/preprocessed', exist_ok=True)


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [49]:
# Selecting the DataFrame for final preprocessing
# We will use `bin_train`, which does not contain null values in the target variable

final_df = bin_train.copy()

# Defining columns by type
numeric_features = [
    'total_duration_minutes', 'time_to_first_response_minutes', 'sent_score'
]
text_feature = 'text_clean'
target_col = 'target_bin'
id_cols = ['conversation_id', 'tweet_id']

# Pipeline for numeric features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# ColumnTransformer to apply different transformations
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features)
    ],
    remainder='passthrough'  # Keep other columns (text, id, etc.)
)

# Apply preprocessing to structured features
structured_features_processed = preprocessor.fit_transform(final_df)

processed_feature_names = numeric_features

print(f"Structured features processed. Shape: {structured_features_processed.shape}")


Structured features processed. Shape: (50630, 12)


In [50]:
# Loading the model for text embeddings
# Using a multilingual model for robustness
model_name = 'distiluse-base-multilingual-cased-v1'
text_embedder = SentenceTransformer(model_name)

# Generating text embeddings

print("Generating text embeddings...")
text_embeddings = text_embedder.encode(
    final_df[text_feature].tolist(),
    show_progress_bar=True
)
print(f"Text embeddings generated. Shape: {text_embeddings.shape}")

# O `ColumnTransformer` manteve as colunas restantes no final do array.
# Do `structured_features_processed`, queremos apenas as partes numéricas.
structured_only_processed = structured_features_processed[:, :len(processed_feature_names)]

# Combinando features estruturadas e embeddings de texto
final_features_array = np.hstack([
    structured_only_processed,
    text_embeddings
])

# Criando o DataFrame final
embedding_feature_names = [f'emb_{i}' for i in range(text_embeddings.shape[1])]
final_column_names = processed_feature_names + embedding_feature_names

preprocessed_df = pd.DataFrame(final_features_array, columns=final_column_names)

# Adicionando de volta os IDs e a variável alvo
preprocessed_df[id_cols] = final_df[id_cols].values
preprocessed_df[target_col] = final_df[target_col].values

print("\\nFinal preprocessed DataFrame (first rows):")
display(preprocessed_df.head())

# --- Saving artifacts ---

# 1. Save the ColumnTransformer
preprocessor_path = '../models/preprocessor.joblib'
joblib.dump(preprocessor, preprocessor_path)
print(f"\\nPreprocessor saved in: {preprocessor_path}")

# The SentenceTransformer model is loaded from the Hub, so we don't need to save it
# locally, just remember the `model_name`.

# 2. Save the final DataFrame
output_path = '../data/preprocessed/preprocessed_customer_data.parquet'
preprocessed_df.to_parquet(output_path, index=False)
print(f"Final preprocessed DataFrame saved in: {output_path}")


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Generating text embeddings...


Batches: 100%|██████████| 1583/1583 [17:05<00:00,  1.54it/s]


Text embeddings generated. Shape: (50630, 512)
\nFinal preprocessed DataFrame (first rows):


Unnamed: 0,total_duration_minutes,time_to_first_response_minutes,sent_score,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,...,emb_505,emb_506,emb_507,emb_508,emb_509,emb_510,emb_511,conversation_id,tweet_id,target_bin
0,-0.006434,-0.025527,-0.134344,-0.025764,0.031075,0.005968,0.013011,-0.042477,-0.063932,-0.032378,...,0.014949,0.009421,0.006726,-0.01401,0.047852,-0.039484,-0.058814,12,12,1
1,-0.006434,-0.025527,-0.073158,-0.027546,-0.044466,-0.020651,0.033756,-0.018774,-0.021676,-0.008755,...,-0.006713,-0.028638,-0.037573,-0.027671,-0.040251,0.056657,0.051989,69,69,0
2,-0.006434,-0.025527,-0.098847,0.002533,0.045608,0.032789,0.033059,-0.020575,-0.043776,-0.067642,...,-0.020298,-0.035196,0.061651,-0.064578,0.058077,0.056611,0.023738,84,84,0
3,-0.006434,-0.025527,0.566254,0.015077,-0.02334,0.035434,-0.021939,-0.009099,-0.026524,0.003372,...,0.069496,0.062662,-0.00727,-0.002056,0.050734,0.030722,-0.034816,149,149,0
4,-0.006434,-0.025527,-0.21144,-0.032982,0.013207,0.021564,0.007199,-0.021147,-0.008958,-0.001474,...,-5.8e-05,0.004172,-0.086024,-0.025065,0.069067,0.002457,-0.048573,208,208,0


\nPreprocessor saved in: ../models/preprocessor.joblib
Final preprocessed DataFrame saved in: ../data/preprocessed/preprocessed_customer_data.parquet
