In [1]:
import pandas as pd
import numpy as np
import re
from transformers import pipeline
from transformers import AutoTokenizer

In [2]:
df = pd.read_json('/content/train.json')

In [3]:
# Removing outliers (filtering prices over the 99.9th percentile)
upper_bound = np.percentile(df["price"].values, 99.9)
df_filtered = df[df["price"] <= upper_bound]

In [4]:
# Preprocess descriptions (remove HTML tags, convert to lowercase)
def remove_html_tags(description):
    description = re.sub(r'<[^>]+>', '', description)
    description = re.sub(r'\s+', ' ', description)
    return description.strip()

In [5]:
df_filtered["description"] = df_filtered["description"].apply(remove_html_tags)
df_filtered["description"] = df_filtered["description"].apply(lambda desc: desc.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered["description"] = df_filtered["description"].apply(remove_html_tags)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered["description"] = df_filtered["description"].apply(lambda desc: desc.lower())


In [6]:
# Tokenizer initialization (using distilBERT model)
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [7]:
# Function to truncate text to fit the model input size
def truncate_text(text, max_length=512):
    if not text.strip():  # Check if the text is empty after removing spaces
        return " "
    inputs = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_length,
        truncation=True
    )
    truncated_text = tokenizer.decode(inputs['input_ids'], skip_special_tokens=True)
    return truncated_text

In [8]:
# Truncate descriptions
df_filtered['truncated_description'] = df_filtered['description'].apply(truncate_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['truncated_description'] = df_filtered['description'].apply(truncate_text)


In [9]:
# Initialize sentiment analysis pipeline
sentiment_pipeline = pipeline("sentiment-analysis", device=0)  # Using GPU if available

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [10]:
# Batch processing function
def get_sentiment_batch(texts):
    return sentiment_pipeline(texts)

In [11]:
# Set batch size for processing
batch_size = 16  # Adjust batch size based on GPU memory

In [12]:
# Initialize a list to store the results
sentiments = []

In [13]:
# Process data in batches
for i in range(0, len(df_filtered), batch_size):
    batch = df_filtered['truncated_description'][i:i + batch_size].tolist()  # Convert to list for batch processing
    sentiments_batch = get_sentiment_batch(batch)
    sentiments.extend(sentiments_batch)  # Add the results to the list

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [14]:
# Assign the results to the DataFrame
df_filtered['sentiment'] = sentiments

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['sentiment'] = sentiments


In [15]:
# Sentiment classification function based on score thresholds
positive_threshold = 0.75
negative_threshold = 0.25

In [16]:
def classify_sentiment(sentiment):
    print(f"Processing sentiment: {sentiment}")  # Debugging line
    score = sentiment['score']
    if sentiment['label'] == 'POSITIVE' and score >= positive_threshold:
        return 1
    elif sentiment['label'] == 'NEGATIVE' and score <= negative_threshold:
        return -1
    elif sentiment['label'] == 'NEUTRAL':
        return 0  # Treat 'NEUTRAL' as 0
    else:
        return 0  # Default case for unexpected labels

In [17]:
# Apply sentiment classification
df_filtered['sentiment_label'] = df_filtered['sentiment'].apply(classify_sentiment)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Processing sentiment: {'label': 'POSITIVE', 'score': 0.9985800981521606}
Processing sentiment: {'label': 'NEGATIVE', 'score': 0.9986535310745239}
Processing sentiment: {'label': 'POSITIVE', 'score': 0.5581045150756836}
Processing sentiment: {'label': 'POSITIVE', 'score': 0.9939087629318237}
Processing sentiment: {'label': 'POSITIVE', 'score': 0.9996242523193359}
Processing sentiment: {'label': 'POSITIVE', 'score': 0.9946309328079224}
Processing sentiment: {'label': 'POSITIVE', 'score': 0.9943278431892395}
Processing sentiment: {'label': 'POSITIVE', 'score': 0.9932077527046204}
Processing sentiment: {'label': 'POSITIVE', 'score': 0.9992578625679016}
Processing sentiment: {'label': 'POSITIVE', 'score': 0.9996932744979858}
Processing sentiment: {'label': 'POSITIVE', 'score': 0.9988749623298645}
Processing sentiment: {'label': 'NEGATIVE', 'score': 0.9823254942893982}
Processing sentiment: {'label': 'POSITIVE', 'score': 0.7481

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['sentiment_label'] = df_filtered['sentiment'].apply(classify_sentiment)


In [19]:
# Display the first few rows to check the result
df_filtered.head(5)

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,latitude,listing_id,longitude,manager_id,photos,price,street_address,interest_level,truncated_description,sentiment,sentiment_label
4,1.0,1,8579a0b0d54db803821a35a4a615e97a,2016-06-16 05:55:27,spacious 1 bedroom 1 bathroom in williamsburg!...,145 Borinquen Place,"[Dining Room, Pre-War, Laundry in Building, Di...",40.7108,7170325,-73.9539,a10db4590843d78c784171a107bdacb4,[https://photos.renthop.com/2/7170325_3bb5ac84...,2400,145 Borinquen Place,medium,spacious 1 bedroom 1 bathroom in williamsburg!...,"{'label': 'POSITIVE', 'score': 0.8850623965263...",1
6,1.0,2,b8e75fc949a6cd8225b455648a951712,2016-06-01 05:44:33,brand new gut renovated true 2 bedroomfind you...,East 44th,"[Doorman, Elevator, Laundry in Building, Dishw...",40.7513,7092344,-73.9722,955db33477af4f40004820b4aed804a0,[https://photos.renthop.com/2/7092344_7663c19a...,3800,230 East 44th,low,brand new gut renovated true 2 bedroomfind you...,"{'label': 'POSITIVE', 'score': 0.998374342918396}",1
9,1.0,2,cd759a988b8f23924b5a2058d5ab2b49,2016-06-14 15:19:59,**flex 2 bedroom with full pressurized wall**l...,East 56th Street,"[Doorman, Elevator, Laundry in Building, Laund...",40.7575,7158677,-73.9625,c8b10a317b766204f08e613cef4ce7a0,[https://photos.renthop.com/2/7158677_c897a134...,3495,405 East 56th Street,medium,* * flex 2 bedroom with full pressurized wall ...,"{'label': 'POSITIVE', 'score': 0.9986716508865...",1
10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,a brand new 3 bedroom 1.5 bath apartmentenjoy ...,Metropolitan Avenue,[],40.7145,7211212,-73.9425,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue,medium,a brand new 3 bedroom 1. 5 bath apartmentenjoy...,"{'label': 'NEGATIVE', 'score': 0.6298896074295...",0
15,1.0,0,bfb9405149bfff42a92980b594c28234,2016-06-28 03:50:23,over-sized studio w abundant closets. availabl...,East 34th Street,"[Doorman, Elevator, Fitness Center, Laundry in...",40.7439,7225292,-73.9743,2c3b41f588fbb5234d8a1e885a436cfa,[https://photos.renthop.com/2/7225292_901f1984...,2795,340 East 34th Street,low,over - sized studio w abundant closets. availa...,"{'label': 'NEGATIVE', 'score': 0.9978052973747...",0
