# This is a quick helper notebook that gives both datasets the same column names

In [1]:
# Essential imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import json
from pathlib import Path
import os
from dotenv import load_dotenv
from openai import OpenAI

# Pandas display configuration, set through the options namespace
pd.options.display.max_columns = None  # None = never truncate columns
pd.options.display.max_rows = 100
pd.options.display.width = 1000
pd.options.display.float_format = lambda value: '%.3f' % value  # three decimals

# Plot appearance: seaborn theme first, then the default figure size
sns.set_theme()
plt.rc('figure', figsize=(12, 8))

In [2]:
# Data Import

# Define the data paths for both datasets
DATA_PATH_1 = "../data/SSOT_manual_LB_20250808_120908.csv" # ⬅️ Change this path if needed
DATA_PATH_2 = "../data/SSOT_manual_BM_20250707_150309.csv" # ⬅️ Change this path if needed


def _load_dataset(path, ordinal, number, prefix=""):
    """Read one CSV file into a DataFrame and report the outcome.

    Parameters
    ----------
    path : str
        Location of the CSV file; passed straight to ``pd.read_csv``.
    ordinal : str
        Word used in the status messages (e.g. ``"first"``).
    number : int
        Dataset number shown in the shape message.
    prefix : str, optional
        Text printed in front of the success message (used to insert a
        blank line between the two datasets' reports).

    Returns
    -------
    pandas.DataFrame
        The loaded data.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist.
    Exception
        Any other error from ``pd.read_csv``; it is logged and re-raised.
    """
    try:
        frame = pd.read_csv(path)
    except FileNotFoundError:
        print(f"❌ Error: the {ordinal} dataset was not found at {path}")
        # Re-raise: continuing would only fail later with a NameError (or,
        # worse, silently reuse a stale frame left over in the kernel).
        raise
    except Exception as e:
        print(f"❌ Error loading the {ordinal} dataset: {str(e)}")
        raise

    print(f"{prefix}✓ {ordinal.capitalize()} dataset loaded successfully")
    print(f"✓ Shape of dataset {number}: {frame.shape}")
    return frame


# Load both datasets; a failure stops the cell with the real error
df_LB = _load_dataset(DATA_PATH_1, "first", 1)
df_BM = _load_dataset(DATA_PATH_2, "second", 2, prefix="\n")

# Display basic information about both datasets
print("\nFirst few rows of dataset 1:\n")
display(df_LB.head())

print("\nFirst few rows of dataset 2:\n")
display(df_BM.head())

✓ First dataset loaded successfully
✓ Shape of dataset 1: (3944, 15)

✓ Second dataset loaded successfully
✓ Shape of dataset 2: (917, 13)

First few rows of dataset 1:



Unnamed: 0,ID,abstract,acmid,author,doi,outlet,title_full,url,year,qualtrics_id,wos_id,ebsco_id,stage_1,stage_2,stage_3
0,Bindu2018503,Online social networks have become immensely p...,,"Bindu, P V and Mishra, R and Thilagam, P S",10.1007/s10844-017-0494-z,Journal of Intelligent Information Systems,{Discovering spammer communities in TWITTER},https://www.scopus.com/inward/record.uri?eid=2...,2018,12,,,True,False,False
1,Moraga2018470,This article explores the ways Latinos—as audi...,,"Moraga, J E",10.1177/0193723518797030,Journal of Sport and Social Issues,"{On ESPN Deportes: Latinos, Sport MEDIA, and t...",https://www.scopus.com/inward/record.uri?eid=2...,2018,22,,,True,False,False
2,Lanosga20181676,This study of American investigative reporting...,,"Lanosga, G and Martin, J",10.1177/1464884916683555,JOURNALISm,"{JOURNALISts, sources, and policy outcomes: In...",https://www.scopus.com/inward/record.uri?eid=2...,2018,47,,,True,False,True
3,Warner2018720,"In this study, we test the indirect and condit...",,"Warner, B R and Jennings, F J and Bramlett, J ...",10.1080/15205436.2018.1472283,Mass Communication and Society,{A MultiMEDIA Analysis of Persuasion in the 20...,https://www.scopus.com/inward/record.uri?eid=2...,2018,50,,,True,False,False
4,Burrows20181117,Professional communicators produce a diverse r...,,"Burrows, E",10.1177/0163443718764807,"MEDIA, Culture and Society",{Indigenous MEDIA producers' perspectives on o...,https://www.scopus.com/inward/record.uri?eid=2...,2018,56,,,True,False,False



First few rows of dataset 2:



Unnamed: 0,(internal) id,(source) id,abstract,title,journal,authors,tags,consensus,labeled_at...9,code,stage_1,stage_2,stage_3
0,33937314,175,There is a worry that serious forms of politic...,Is Context the Key? The (Non-)Differential Eff...,Polit. Commun.,,,o,,-1,True,False,False
1,33937315,113,The electoral model of democracy holds the ide...,POLITICAL NEWS IN ONLINE AND PRINT NEWSPAPERS ...,Digit. Journal.,,,o,,-1,True,False,False
2,33937316,122,Machine learning is a field at the intersectio...,Machine Learning for Sociology,Annu. Rev. Sociol.,,,o,,-1,True,False,False
3,33937317,467,Research on digital glocalization has found th...,Improving Health in Low-Income Communities Wit...,J. Commun.,,,o,,-1,True,False,False
4,33937318,10,Political scientists often wish to classify do...,Using Word Order in Political Text Classificat...,Polit. Anal.,,,o,,-1,True,False,False


In [3]:
# Rename 'title' column to 'title_full' in df_BM for consistency

# Refuse to create two columns with the same name
if {'title', 'title_full'} <= set(df_BM.columns):
    raise ValueError("df_BM already has both 'title' and 'title_full' columns")

# rename() silently ignores keys that are absent, so this is a no-op when the
# cell is re-run after 'title' has already been renamed.
df_BM = df_BM.rename(columns={'title': 'title_full'})

# Verify the column was renamed successfully (rename alone does not tell us)
if 'title_full' not in df_BM.columns:
    raise KeyError("df_BM has neither a 'title' nor a 'title_full' column")

print(f"✅ Column renamed successfully")
print(f"📊 df_BM columns: {list(df_BM.columns)}")
print(f"📏 df_BM shape: {df_BM.shape}")

✅ Column renamed successfully
📊 df_BM columns: ['(internal) id', '(source) id', 'abstract', 'title_full', 'journal', 'authors', 'tags', 'consensus', 'labeled_at...9', 'code', 'stage_1', 'stage_2', 'stage_3']
📏 df_BM shape: (917, 13)


In [4]:
# Save df_BM with timestamp to data directory
# (datetime is imported once, in the imports cell at the top of the notebook)

# Generate timestamp in the same format as your other files
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Create filename with timestamp, so every run writes a new file instead of
# overwriting an earlier export
filename = f"SSOT_manual_BM_{timestamp}.csv"
filepath = f"../data/{filename}"

# Save the DataFrame; index=False keeps the row index out of the CSV
df_BM.to_csv(filepath, index=False)

# Confirm save
print(f"✅ df_BM saved successfully to: {filepath}")
print(f"📊 Saved {len(df_BM)} rows and {len(df_BM.columns)} columns")

✅ df_BM saved successfully to: ../data/SSOT_manual_BM_20250813_132621.csv
📊 Saved 917 rows and 13 columns
