In [2]:
cd /Users/nte/Documents/Chicago\ PhD\ Projects/Political\ culture\ maps/1.ANALYSIS


[WinError 3] The system cannot find the path specified: '/Users/nte/Documents/Chicago\\ PhD\\ Projects/Political\\ culture\\ maps/1.ANALYSIS'
c:\Users\asarr\Documents\Projects\pol_cul_maps_LA


!/usr/bin/env python coding: utf-8
Culture Maps Project

## Author: Alejandro Sarria-Morales
## Created: July 2024
## Date(last modified): July 26th 2024
## Data storage: 
    ### Merges website's source data (from csv) and text data dictionary (from json) and saves it as a csv. 
    ### It allows to create different data frames filtering by the number of words (e.g., all websites with less than 3k)
## Working with the Corpus of Contemporary American English (COCA)

## Notebook index:
    # 1. Libraries
    # 2. Helper functions
    # 3. Pipeline
    # 4. Save data frame as csv

# 1. Libraries

In [1]:
# # 1. Libraries 

import pandas as pd
import json
import csv


# 2. Helper functions

In [18]:
def creates_df(source_list, websites_text, max_words, full_data=False):
    '''
    Merges sources data and websites text data from dictionary (json) and creates a 
    pandas dataframe. 
    
    Inpputs:
        source_list: (list) websites urls
        websites_text: (dictionary) 
                    key - (int) id
                    value - (string) websites' texts 
        max_words: (int) creates cut off depending on the number of words per website
                    e.g., if checkpoing 1K => no website with more than 1K words enters
        full_data: (boolean) defines data output 
        
    Output:
        If argument full_data TRUE => returns the entire data frame with NAs in text cells that it did not append.
            => span_df (pandas data frame) 
        If argument full_data FALSE => returns the entire data frame with NAs in text cells that it did not append.
             => smaller_df (pandas data frame)
        Defalt is FALSE.
        
    '''
    coca_df = source_list
    
    coca_df['text'] = ''
    coca_df['id_string'] = coca_df['textID'].astype(str)

    c = 0
    for index, row in coca_df.iterrows():
        textid_value = row['id_string']
        if textid_value in websites_text and len(websites_text[textid_value]) < max_words:
            c = c + 1
            coca_df.at[index, 'text'] = websites_text[textid_value]
        else:
            coca_df.at[index, 'text'] = 'NA'

    if full_data:
        print("RETURNING FULL DATA FRAME")
        return coca_df
    else:
        print("RETURNING SMALL DATA FRAME. NO NAs.") 
        smaller_df = coca_df[coca_df['text'] != 'NA']
        print("Number of website's texts incuded in data frame:", c) 
        return smaller_df

In [3]:
def count_matches(source_list, websites_text):
    '''
    Inpputs:
        source_list: (list) websites urls
        websites_text: (dictionary) 
                    key - (int) id
                    value - (string) websites' texts 
    Output:
        c: (int) count of matches
    '''
    
    c = 0
    
    for index, row in source_list.iterrows():
        textid_value = row['id_string']
        if textid_value in websites_text:
            c = c + 1

    print("Number of matches: ids both in dicitonary and source:", c)

    return c
    

# 3. Pipeline

In [29]:
# Reading json 

json_file = r"../data/dict_coca_full.json"

with open(json_file, "r") as read_file:
    dict_text_websites = json.load(read_file)


In [14]:
# File path
file_path = '../data/sources_coca.csv'

# Read the CSV file into a pandas DataFrame
df_sources = pd.read_csv(file_path)

In [15]:
df_sources.columns

Index(['textID', '#words', 'year', 'genre', 'subgen', 'source', 'title'], dtype='object')

In [16]:
df_sources.shape

(248948, 7)

In [30]:
dict_text_websites

{'4138365': 'was no City of matter what the Left says  AFTER A TIME  ruts appear in the intellectual landscape  engraved through repetition of the same words  the same notions and incantations . " City of Hate " would be one of those  another  " right-wing hysteria "  also " paranoia  kooks  extremists  deranged  out of control . " The image of Dallas  Texas  the city where President Kennedy was slain in 1963  has the familiarity of a television commercial played so many times that reflex takes the place of reasoned assessment . Why analyze or appraise  Dallas  if it didn \' t gun down the president  certainly furnished the stage and props for a creep like Lee Harvey Oswald . What else is there  my friends  that \' s worth knowing   From the historical standpoint  that is . I \' m not convinced  actually  that vast numbers of Americans spend their days plotting to make the city of Dallas pay for the Dallas  by           The Immortals " until he became so at the Triple Underpass in Dall

In [21]:
df = creates_df(df_sources, dict_text_websites, 1000, full_data=False)

RETURNING SMALL DATA FRAME. NO NAs.
Number of website's texts incuded in data frame: 3121


In [11]:
df.shape

(223830, 9)

#### I was able to run the funciton and create an smaller data frame for websites with less than 1,000 words.

In [None]:
df_3k = creates_df(df_sources, dict_text_websites, 3000, full_data=False)
df.shape

In [33]:
df_10k = creates_df(df_sources, dict_text_websites, 100000, full_data=False)
df_10k.to_csv('../data/coca_full_text_100k_words.csv', index=False)

RETURNING SMALL DATA FRAME. NO NAs.
Number of website's texts incuded in data frame: 218347


In [27]:
# checkpoint (if needed)

count = count_matches(df_sources, dict_text_websites) 

Number of matches: ids both in dicitonary and source: 219337


### Interesting! From 21593 websites in the dictionary 21582 make match in the source!

# 4. Saves data frame

In [13]:
df.to_csv('span_full_text_1k_words.csv')

In [14]:
df_3k.to_csv('span_full_text_3k_words.csv')