In [1]:
import pandas as pd
import json
import re

In [2]:
# Read the per-country posts dump (UTF-8 JSON) into memory
with open('country_posts.json', encoding='utf-8') as f:
    data = json.loads(f.read())

In [3]:
# Build one DataFrame per country (top-level key of `data`), flattening that
# country's posts and tagging every row with the country it came from
dfs = [
    pd.json_normalize(posts).assign(country=country)
    for country, posts in data.items()
]

# Stack the per-country frames into a single table with a fresh 0..n-1 index
final_df = pd.concat(dfs, ignore_index=True)

final_df

Unnamed: 0,caption,comment_count,date,like_count,play_count,country,location
0,🕌💖\n࿓ \n*\n*\n#BandarSeriBegawan #Brunei #文莱 #汶萊,12,2023-09-26 00:25:42+00:00,3,0,brunei,
1,"Bandar Seri Begawan, the capital of the tiny o...",182,2025-01-05 21:20:35+00:00,3569,0,brunei,
2,"Jerudong Park, Brunei, new year's eve 1996.\n\...",1,2024-08-08 04:37:02+00:00,103,0,brunei,
3,First time seeing a Unimog and it was quite th...,45,2023-08-24 16:36:23+00:00,635,0,brunei,
4,Flowers are always the way to a woman’s heart!...,0,2024-05-14 05:27:27+00:00,3,217,brunei,
...,...,...,...,...,...,...,...
6671,TOP16 IN POPPING HAND STYLE 🔫\n AT @radikalfor...,212,2024-01-11 07:08:13+00:00,1999,22522,vietnam,
6672,Слушать и наслаждаться ❤️\n\n#нячангвьетнам #ю...,6,2024-03-08 22:05:23+00:00,290,0,vietnam,
6673,"Ну как вам фото, которые с первого раза сделал...",25,2023-08-22 20:16:46+00:00,722,0,vietnam,
6674,✨MoonDog✨\n\nAmazing restaurant with the best ...,25,2022-02-28 16:17:40+00:00,3280,68071,vietnam,


In [4]:
# Parse the raw 'date' values once and reuse the parsed Series below
timestamps = pd.to_datetime(final_df['date'])
final_df['date'] = timestamps

# Derive the calendar-date and clock-time parts as their own columns
final_df['date_only'] = timestamps.dt.date
final_df['time_only'] = timestamps.dt.time

# Show the parsed timestamp next to the two derived columns
print(final_df.loc[:, ['date', 'date_only', 'time_only']])

# Optional calendar features, currently disabled:

# # Extract day of the week (0 = Monday, 6 = Sunday)
# final_df['day_of_week'] = final_df['date'].dt.dayofweek

# # Extract month
# final_df['month'] = final_df['date'].dt.month

# # Extract year
# final_df['year'] = final_df['date'].dt.year

# # Extract whether the post was made on a weekend (1 if weekend, 0 if weekday)
# final_df['is_weekend'] = final_df['day_of_week'].apply(lambda x: 1 if x >= 5 else 0)

# # Optionally, extract the weekday name (e.g., Monday, Tuesday)
# final_df['weekday_name'] = final_df['date'].dt.strftime('%A')

# # Display the updated DataFrame with new columns
# print(final_df[['date', 'date_only', 'time_only', 'day_of_week', 'month', 'year', 'is_weekend', 'weekday_name']])

                          date   date_only time_only
0    2023-09-26 00:25:42+00:00  2023-09-26  00:25:42
1    2025-01-05 21:20:35+00:00  2025-01-05  21:20:35
2    2024-08-08 04:37:02+00:00  2024-08-08  04:37:02
3    2023-08-24 16:36:23+00:00  2023-08-24  16:36:23
4    2024-05-14 05:27:27+00:00  2024-05-14  05:27:27
...                        ...         ...       ...
6671 2024-01-11 07:08:13+00:00  2024-01-11  07:08:13
6672 2024-03-08 22:05:23+00:00  2024-03-08  22:05:23
6673 2023-08-22 20:16:46+00:00  2023-08-22  20:16:46
6674 2022-02-28 16:17:40+00:00  2022-02-28  16:17:40
6675 2022-08-27 22:20:38+00:00  2022-08-27  22:20:38

[6676 rows x 3 columns]


In [5]:
def clean_text(text):
    """Normalize a caption for downstream language detection and scoring.

    Steps, in order:
      1. Remove URLs (``http://``/``https://`` links and ``www.`` hosts).
      2. Remove every character that is not a word character, whitespace,
         or one of ``, . ! ? ;`` (this drops emojis, ``#``, ``@``, quotes...).
      3. Collapse whitespace runs to a single space and trim the ends.

    Args:
        text: The caption to clean. Non-string values (e.g. ``None`` or NaN
            for a post without a caption) are treated as an empty caption.

    Returns:
        The cleaned caption as a ``str``; ``''`` for non-string input.
    """
    # Missing captions arrive as None/NaN; re.sub would raise TypeError on them.
    if not isinstance(text, str):
        return ''

    # URLs go first, while '://' and '/' are still present. The 'www' branch
    # needs a word boundary and a literal dot so ordinary words that merely
    # contain "www" (e.g. "awwww") are left intact.
    text = re.sub(r'https?://\S+|\bwww\.\S+', '', text, flags=re.IGNORECASE)

    # Remove emojis and other unwanted symbols
    text = re.sub(r'[^\w\s,.!?;]', '', text)

    # Remove extra spaces and trim
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Clean every caption and keep the result alongside the original text
final_df['cleaned_caption'] = final_df['caption'].map(clean_text)

# Compare raw and cleaned captions side by side
print(final_df.loc[:, ['caption', 'cleaned_caption']])

                                                caption  \
0      🕌💖\n࿓ \n*\n*\n#BandarSeriBegawan #Brunei #文莱 #汶萊   
1     Bandar Seri Begawan, the capital of the tiny o...   
2     Jerudong Park, Brunei, new year's eve 1996.\n\...   
3     First time seeing a Unimog and it was quite th...   
4     Flowers are always the way to a woman’s heart!...   
...                                                 ...   
6671  TOP16 IN POPPING HAND STYLE 🔫\n AT @radikalfor...   
6672  Слушать и наслаждаться ❤️\n\n#нячангвьетнам #ю...   
6673  Ну как вам фото, которые с первого раза сделал...   
6674  ✨MoonDog✨\n\nAmazing restaurant with the best ...   
6675  地下の秘密壕から漂う過去の香り、期間限定の特別展示も~ 🇻🇳\n\nฅ^•ﻌ•^ฅ\n\n⠀...   

                                        cleaned_caption  
0                        BandarSeriBegawan Brunei 文莱 汶萊  
1     Bandar Seri Begawan, the capital of the tiny o...  
2     Jerudong Park, Brunei, new years eve 1996. Mic...  
3     First time seeing a Unimog and it was quite th...  
4

In [6]:
# final_df.describe()

In [None]:
from googletrans import Translator

def analyze_and_translate_languages_with_google(dataset):
    """Reduce ``dataset`` to rows whose ``cleaned_caption`` is in English.

    For every row the caption language is detected with googletrans:
      * 'en'      -> the row is kept and its "language" is set to 'en';
      * 'unknown' -> (detection raised) the row is dropped;
      * otherwise -> the caption is translated to English; the row is kept
        (caption replaced by the translation, "language" set to 'en') only if
        the translation is non-empty and is itself detected as 'en'.

    Args:
        dataset: DataFrame with a "cleaned_caption" column. It is MODIFIED IN
            PLACE: a "language" column is added if missing, captions may be
            overwritten with their translation, and rejected rows are dropped.

    Returns:
        The same ``dataset`` object, after the in-place changes.

    Warns:
        RuntimeWarning: once, if any detection/translation call raised. Such
            failures are still handled as before (the row is dropped), but
            they are no longer completely silent.
    """
    import warnings  # local import: only needed to report swallowed failures

    # Initialize the Google Translator
    # NOTE(review): some googletrans releases expose coroutine-based methods;
    # if so, every call below would land in the except branches and all rows
    # would be dropped - confirm against the installed version.
    translator = Translator()

    # Number of API calls that raised, per operation.
    failures = {"detect": 0, "translate": 0}

    def detect_language(text):
        """Return the detected language code, or 'unknown' if the call raises."""
        try:
            return translator.detect(text).lang
        except Exception:
            failures["detect"] += 1
            return 'unknown'

    def translate_to_english(text):
        """Return the English translation, or None if the call raises."""
        try:
            return translator.translate(text, dest='en').text
        except Exception:
            failures["translate"] += 1
            return None

    # Create the column up front so it exists even when no row is detected as
    # English (otherwise callers selecting "language" hit a KeyError).
    if "language" not in dataset.columns:
        dataset["language"] = None

    # Index labels of rows to drop once the loop is finished
    to_drop = []

    for index, row in dataset.iterrows():
        caption = row["cleaned_caption"]
        detected_language = detect_language(caption)

        if detected_language == 'en':
            dataset.at[index, "language"] = 'en'
            continue  # Already English, nothing to translate

        if detected_language == 'unknown':
            to_drop.append(index)  # Detection failed
            continue

        # Non-English caption: translate, then verify the result really is English
        translated_text = translate_to_english(caption)
        if translated_text and detect_language(translated_text) == 'en':
            dataset.at[index, "cleaned_caption"] = translated_text
            dataset.at[index, "language"] = 'en'
        else:
            to_drop.append(index)  # Translation failed or is still not English

    # Drop rows where the language is still unknown or non-English
    dataset.drop(to_drop, inplace=True)

    if failures["detect"] or failures["translate"]:
        warnings.warn(
            f"{failures['detect']} detection call(s) and "
            f"{failures['translate']} translation call(s) raised and were "
            "treated as failures; the affected rows were dropped.",
            RuntimeWarning,
            stacklevel=2,
        )

    return dataset  # Same object, now containing only English captions

# Run the detect/translate pass over the combined posts table
translated_caption = analyze_and_translate_languages_with_google(dataset=final_df)
print(translated_caption.loc[:, ['cleaned_caption', 'language']])

In [None]:
# Score each caption in `translated_caption` with NLTK's VADER sentiment analyzer
import nltk

# SentimentIntensityAnalyzer needs the VADER lexicon on disk and raises
# LookupError without it. Fetch it only when missing, so a fresh environment
# can run this cell without a manual download step.
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon')

from nltk.sentiment.vader import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

# Apply sentiment analysis: keep only the "compound" entry of the score dict
# that polarity_scores returns for each caption
translated_caption["sentiment_score"] = translated_caption["cleaned_caption"].apply(
    lambda text: analyzer.polarity_scores(text)["compound"]
)

# Display the results
print(translated_caption.head())

                                             caption  comment_count  \
1  Just wrapped up an epic core & full-body sessi...             22   
2  Случайные кадры с Бали 🌿 Какой кадр нравится б...             30   
3  ここ数年でバリ島ではトリュフチョコを扱うチョコレート専門店が増えましたが、こちらもその一つ！...              2   
4  Nggak heran lagi deh sama Hyoyeon kalo tiba-ti...             26   
5  📍Cretya Ubud, Bali 🇮🇩\n.\n.\n.\n#cretyaubud #c...             65   

                       date  like_count  play_count country location  \
1 2025-01-07 17:58:32+00:00        1259           0    bali      NaN   
2 2022-08-04 03:04:51+00:00         559           0    bali      NaN   
3 2023-04-29 20:40:00+00:00         559           0    bali      NaN   
4 2023-10-18 01:46:08+00:00        1985      120120    bali      NaN   
5 2024-12-07 11:59:29+00:00         936           0    bali      NaN   

    date_only time_only  day_of_week  month  year  is_weekend weekday_name  \
1  2025-01-07  17:58:32            1      1  2025           0 

In [None]:
# Persist the scored captions: one JSON object per row, dates as ISO-8601 strings
translated_caption.to_json(
    "sentiment_analysis(instagram).json",
    orient="records",
    date_format="iso",
)
