In [None]:
# Trying to see if this makes a change

In [None]:
# Mounting Google drive to colab

from google.colab import drive
# Mount Google Drive under /content/drive, then list the mount point as a
# quick visual check of what is available there.
drive.mount('/content/drive')
!ls /content/drive/

In [4]:
# Shell escapes: commit all modified tracked files (-a) with a fixed message,
# then push the current branch to its remote.
!git commit -a -m "Changes are committed to GitHub" 
!git push

On branch main
Your branch is up to date with 'origin/main'.

nothing to commit, working tree clean
Everything up-to-date


In [2]:
# Read the JSON Lines file (one JSON object per line) into a pandas DataFrame (df)

import pandas as pd
import json

input_file = 'digitalnomad_comments.json'

def read_json(filename):
    """Lazily yield one parsed JSON object per non-blank line of a JSON Lines file.

    Parameters
    ----------
    filename : str or path-like
        Path to a text file holding one JSON document per line.

    Yields
    ------
    object
        The result of ``json.loads`` for each non-blank line, in file order.

    Raises
    ------
    FileNotFoundError
        If ``filename`` does not exist (raised when iteration starts, because
        this is a generator).
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default,
    # so non-ASCII comment text is read the same way on every machine.
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank or whitespace-only lines (e.g. a trailing newline at the
            # end of the dump) carry no record and would make json.loads fail.
            if not line.strip():
                continue
            yield json.loads(line)

# Materialise the generator into a DataFrame: one row per JSON record
df = pd.DataFrame(read_json(input_file))

# Show the frame itself
print(df)

# Basic facts about the frame
print('DataFrame Properties:')
print(type(df))

# df.keys() is the column index, so count and list the columns directly
num_keys = len(df.columns)
print("Number of keys (future column names):", num_keys)
print("Keys (future column names):", list(df.columns))

print(df.shape)

FileNotFoundError: [Errno 2] No such file or directory: 'digitalnomad_comments.json'

In [None]:
# Dropping the unnecessary columns to reduce memory/computing usage

import sys

# Measure the footprint before narrowing the frame
size_before = df.memory_usage(index=True, deep=True).sum()
print(f"DataFrame size before dropping columns: {size_before} bytes")

# Keep only the comment body and rename it to 'text' in one non-mutating step.
# The previous version sliced with df[['body']] and then renamed in place,
# i.e. it mutated a selection of another frame (the pattern pandas'
# copy-vs-view warning is about) and raised KeyError on 'body' when the cell
# was executed a second time. With the guard, a re-run leaves df as it is.
if 'body' in df.columns:
    df = df[['body']].rename(columns={'body': 'text'})
elif 'text' not in df.columns:
    raise KeyError("DataFrame has neither a 'body' nor a 'text' column")

# Measure again after narrowing
size_after = df.memory_usage(index=True, deep=True).sum()
print(f"DataFrame size after dropping columns: {size_after} bytes")

# Calculate reduction
reduction = size_before - size_after
print(f"Memory usage reduced by {reduction} bytes")

# Print the updated df and its properties
print('DataFrame Properties:')
print(type(df))

num_keys = len(df.keys())
print("Number of keys (future column names):", num_keys)

print("Keys (future column names):", list(df.keys()))
print(df.shape)
print(df)

DataFrame size before dropping columns: 261266872 bytes
DataFrame size after dropping columns: 261266872 bytes
Memory usage reduced by 0 bytes
DataFrame Properties:
<class 'pandas.core.frame.DataFrame'>
Number of keys (future column names): 1
Keys (future column names): ['text']
(745868, 1)
                                                     text
0                                                      Hi
1                                                     Hi!
2       What business are you in? And where are you now? 
3       Tech support for Hostgator in Houston Texas. I...
4       Cool! So what's your plan to make money while ...
...                                                   ...
745863  I'd hazard that many if not the majority of sa...
745864                           India, Thailand, Morocco
745865  Programming will be one of the last jobs to be...
745866              BA easily. Summer, way more fun, etc.
745867  6 weeks is the sweet spot for me. \n\nIt reall...

[745868 row

In [None]:
# Preprocessing the data before BERT embeddings

import pandas as pd
import numpy as np
import re
import torch
from transformers import BertTokenizer, BertModel, pipeline
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
import nltk
# Fetch the NLTK resources this cell uses: the stop-word lists
# (stopwords.words('english')) and WordNet (for WordNetLemmatizer).
nltk.download('stopwords')
nltk.download('wordnet')

# Text cleaning function
def clean_text(text):
    """Normalise one raw comment for downstream tokenisation.

    Steps, in order: lower-case, drop every character that is neither a word
    character nor whitespace (underscores count as word characters and are
    kept), drop digit runs, and turn each newline into a space. Runs of
    spaces are not collapsed.

    Parameters
    ----------
    text : str or any
        The raw comment. Non-string values (e.g. None or NaN produced for
        records without a body) are treated as empty text.

    Returns
    -------
    str
        The cleaned text; ``''`` for non-string input.
    """
    # A missing body shows up as None/NaN in the DataFrame column; calling
    # .lower() on it would abort the whole .apply() with AttributeError.
    if not isinstance(text, str):
        return ''
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)      # Remove digits
    text = re.sub(r'\n', ' ', text)      # Replace new lines with a space
    return text

# Normalise every comment into a new 'clean_text' column
df['clean_text'] = df['text'].apply(clean_text)

# Extra filler words to drop on top of NLTK's English list
custom_stop_words = {'yeah', 'yep', 'yes','mmhmm', 'agree', 'ive', 'interviewee', 'laugh'}

# Combined stop-word set: NLTK English defaults plus the custom words
stop_words = set(stopwords.words('english')) | custom_stop_words

def remove_stop_words(text):
    """Return ``text`` without the whitespace-separated tokens found in ``stop_words``.

    The surviving tokens are re-joined with single spaces.
    """
    kept_tokens = [token for token in text.split() if token not in stop_words]
    return ' '.join(kept_tokens)

# Strip stop words from the already-cleaned comments
df['clean_text'] = df['clean_text'].apply(remove_stop_words)

# One WordNet lemmatizer instance, shared by every call to lemmatize_text
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Lemmatize each whitespace-separated token of ``text`` and re-join with single spaces."""
    lemmas = map(lemmatizer.lemmatize, text.split())
    return ' '.join(lemmas)

# Replace the cleaned comments with their lemmatized form
df['clean_text'] = df['clean_text'].apply(lemmatize_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
import pandas as pd
import numpy as np
import re
import torch
from transformers import BertTokenizer, BertModel, pipeline
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
import nltk
# Same two NLTK downloads as in the preprocessing cell, repeated here.
nltk.download('stopwords')
nltk.download('wordnet')

# BERT embedding extraction
# Load the pretrained 'bert-base-uncased' tokenizer and encoder; both are
# module-level objects read by get_bert_embeddings.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def get_bert_embeddings(texts, batch_size=32):
    """Embed each text as the mean of its BERT token vectors.

    Uses the module-level ``tokenizer`` and ``model``.

    Parameters
    ----------
    texts : iterable of str
        Documents to embed; inputs longer than 512 tokens are truncated.
    batch_size : int, default 32
        Number of texts sent through the model per forward pass.

    Returns
    -------
    numpy.ndarray
        Array with one row per input text, in input order; an empty
        ``(0, hidden_size)`` array when ``texts`` is empty.

    Notes
    -----
    The earlier implementation padded every text to 512 tokens and averaged
    over all 512 positions, so the hidden states at padding positions were
    mixed into each embedding. Here each batch is padded only to its longest
    member and the mean is taken over real tokens only (attention mask = 1).
    """
    texts = list(texts)
    if not texts:
        # np.vstack([]) raises; return a well-formed empty result instead
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    embeddings = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        # Dynamic padding: pad to the longest text in this batch, cap at 512
        inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        hidden = outputs.last_hidden_state
        # Expand the mask so it broadcasts over the hidden dimension
        mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        token_sum = (hidden * mask).sum(dim=1)
        token_count = mask.sum(dim=1).clamp(min=1)  # never divide by zero
        embeddings.append((token_sum / token_count).numpy())
    return np.vstack(embeddings)

document_vectors = get_bert_embeddings(df['clean_text'].tolist())

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Completion marker; presumably refers to the embedding cell above — confirm.
print('done with the long task')

done with the long task


In [None]:
# Clustering
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Consider integrating PCA here for dimension reduction (not a must, can be beneficial)

num_clusters = 5
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)  # Added n_init
df['cluster'] = kmeans.fit_predict(document_vectors)

# Evaluate clustering
# The exact silhouette needs the distances between every pair of rows, which
# grows quadratically with the number of documents. Score a fixed-seed random
# sample instead; when there are at most 10000 rows the sample is the whole
# data set, so the result is the exact score.
silhouette_sample_size = min(10000, len(document_vectors))
silhouette_avg = silhouette_score(
    document_vectors,
    df['cluster'],
    sample_size=silhouette_sample_size,
    random_state=42,
)
print(f"Silhouette Score: {silhouette_avg}")

# Sentiment analysis
sentiment_pipeline = pipeline("sentiment-analysis")

def analyze_sentiment(texts):
    """Return the predicted sentiment label for every text, in input order.

    Uses the module-level ``sentiment_pipeline``.

    Parameters
    ----------
    texts : iterable of str
        Documents to classify.

    Returns
    -------
    list of str
        One label per input text; ``[]`` for empty input.

    Notes
    -----
    Over-long inputs are truncated to 512 tokens by the pipeline's own
    tokenizer. The earlier version measured and cut the text with the global
    BERT ``tokenizer`` loaded for the embedding step, then decoded the tokens
    back to a string that the pipeline tokenised again; that only stays within
    the limit if both tokenizers split text identically.
    """
    texts = list(texts)
    if not texts:
        return []
    predictions = sentiment_pipeline(texts, truncation=True, max_length=512)
    return [prediction['label'] for prediction in predictions]

df['sentiment'] = analyze_sentiment(df['clean_text'])


# Aggregate sentiment by cluster
import pandas as pd

# Share of each sentiment label within every cluster: one row per cluster,
# one column per label, label/cluster pairs that never occur filled with 0.
sentiment_summary = (
    df.groupby('cluster')['sentiment']
    .value_counts(normalize=True)
    .unstack()
    .fillna(0)
)
print(sentiment_summary)

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE

# Project the document vectors to 2-D with t-SNE (fixed seed)
# NOTE(review): t-SNE over the full matrix may be very slow for a large
# corpus — consider plotting a subsample; confirm with the author.
tsne = TSNE(n_components=2, random_state=42)
reduced_data = tsne.fit_transform(document_vectors)

# Scatter of the 2-D projection, coloured by k-means cluster id
tsne_fig, tsne_ax = plt.subplots(figsize=(10, 7))
tsne_points = tsne_ax.scatter(
    reduced_data[:, 0],
    reduced_data[:, 1],
    c=df['cluster'],
    cmap='viridis',
    alpha=0.5,
)
tsne_fig.colorbar(tsne_points, ax=tsne_ax, label='Cluster')
tsne_ax.set_title("t-SNE Visualization of Clusters")
plt.show()

# Bar chart: number of comments per cluster, split by sentiment label
sentiment_fig, sentiment_ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='cluster', hue='sentiment', data=df, ax=sentiment_ax)
sentiment_ax.set_title("Sentiment Distribution by Cluster")
plt.show()

# Examine Top Terms in Each Cluster
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

def get_top_terms_per_cluster(df, n_terms=10):
    """Find the most frequent terms of each cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'clean_text' column (documents) and a 'cluster'
        column (cluster label per document).
    n_terms : int, default 10
        Maximum number of terms returned per cluster.

    Returns
    -------
    dict
        Maps each cluster label (in order of first appearance) to a list of
        ``(term, count)`` tuples sorted by descending count.
    """
    vectorizer = CountVectorizer(stop_words='english')
    # Vectorise the whole corpus once; row i of X belongs to row i of df
    X = vectorizer.fit_transform(df['clean_text'])
    vocabulary = vectorizer.vocabulary_
    cluster_labels = df['cluster'].to_numpy()

    top_terms = {}
    for cluster in df['cluster'].unique():
        # Row-slice the matrix built above instead of tokenising the
        # cluster's documents a second time with vectorizer.transform
        counts = np.asarray(X[cluster_labels == cluster].sum(axis=0)).ravel()
        words_freq = sorted(
            ((word, counts[idx]) for word, idx in vocabulary.items()),
            key=lambda pair: pair[1],
            reverse=True,
        )
        top_terms[cluster] = words_freq[:n_terms]
    return top_terms

# Print every cluster's top terms as "term (count)" pairs on one line
top_terms = get_top_terms_per_cluster(df)
for cluster, terms in top_terms.items():
    print(f"Cluster {cluster}:")
    print(", ".join(f"{word} ({count})" for word, count in terms))
    print()

# Sample Texts from Each Cluster
def sample_texts_by_cluster(df, n_samples=5):
    """Draw a reproducible random sample of cleaned texts from each cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'cluster' and 'clean_text' columns.
    n_samples : int, default 5
        Maximum number of texts per cluster; clusters with fewer rows
        contribute all of their rows.

    Returns
    -------
    dict
        Maps each cluster label to a list of sampled texts.
    """
    samples = {}
    for cluster in df['cluster'].unique():
        cluster_texts = df.loc[df['cluster'] == cluster, 'clean_text']
        # Series.sample raises ValueError when asked for more rows than the
        # cluster holds, so cap the request at the cluster size.
        take = min(n_samples, len(cluster_texts))
        samples[cluster] = cluster_texts.sample(n=take, random_state=42).tolist()
    return samples

# Print the sampled texts of every cluster as a bulleted list
cluster_samples = sample_texts_by_cluster(df)
for cluster in cluster_samples:
    print(f"Cluster {cluster} samples:")
    for text in cluster_samples[cluster]:
        print(f"- {text}")
    print()

# Cluster Profiling
def profile_clusters(df):
    """Summarise each cluster as one row.

    Returns a DataFrame with columns 'cluster', 'clean_text' (all of the
    cluster's texts joined by single spaces) and 'sentiment' (the first mode
    of the cluster's sentiment labels).
    """
    def _join_texts(texts):
        return ' '.join(texts)

    def _first_mode(sentiments):
        return sentiments.mode()[0]

    aggregations = {'clean_text': _join_texts, 'sentiment': _first_mode}
    cluster_profile = df.groupby('cluster').agg(aggregations)
    return cluster_profile.reset_index()

# Build the one-row-per-cluster profile table and print it.
cluster_profiles = profile_clusters(df)
print(cluster_profiles)