## Compiling Scraping Results

Author: Miraya\
Purpose: Creating csv files for pyktok scraped data that is correlated with year and search term data 

In [1]:
import json
import os
import csv
import pandas as pd

In [2]:
#get dict with search term and video ID as key and value
def get_ID_search_term(json_dict):
    result = {}
    for key in json_dict:
        # key is each search term
        all_urls = json_dict[key]['urls']
        for url in all_urls:
            #get video ID
            vid_ID = int(url[len(url)-19:])
            result[vid_ID] = key
    return result

In [3]:
def add_vid_id(transcript_df):
    all_ids = []
    #transcript_df = pd.read_csv(filepath)
    for index, row in transcript_df.iterrows():
        transc_file_name = row['File Name']
        tfl = len(transc_file_name)
        transcript_id = int(transc_file_name[tfl-23:tfl-4])
        all_ids.append(transcript_id)
    transcript_df['video_id'] = all_ids
    return transcript_df

## Applying to all Result Files

In [37]:
def modify_csv():
    #get list of files to compile
    path = 'pyktok_csv_files'
    all_files = os.listdir(path)
    req_files = [file for file in all_files if 'failed' not in file]

    #these files have all the terms for both chunks
    json_path = 'json_result_files'
    all_json_files = os.listdir(json_path)

    transcript_path = 'transcript_csv_files'
    all_transcript_files = os.listdir(transcript_path)

    for f in req_files:
        #opening csv results file and downloading as pandas 
       # fipa2 = f'{path}/{f}'
      #  print(fipa2)
        try:
            df = pd.read_csv(os.path.join(path, f))
        except: 
            print(os.path.join(path, f))
        
        year = f[7:11]
        number = f[12:13]
        for j in all_json_files:
            if year in j:
                print(j)
                fipa = f'{json_path}/{j}'
                with open (fipa, 'r') as json_file:
                    data = json.load(json_file)
                    search_term_dct = get_ID_search_term(data)

                    search_term_column = []
                    for index, row in df.iterrows():
                        try:
                            id = row['video_id']
                            search_term_column.append(search_term_dct[id])
                        except  Exception as e:
                            print(f'{e}')
                            search_term_column.append('')
                        
                    df['search_term'] = search_term_column
                    df['year'] = [year] * df.shape[0]
                    

        for t in all_transcript_files:
            year_transcript = t[11:15]
            number_transcript = t[16:17]

            if year == year_transcript and number == number_transcript:
                transcript_df = add_vid_id(f'{transcript_path}/{t}')
                #transcript_df
                merged_df = pd.merge(df, transcript_df, on='video_id')

                df.drop(columns=['video_id'])
                
                for col in transcript_df.columns:
                    if col not in df.columns:
                        df[col] = transcript_df[col]    
            

        df.to_csv(f'final_result_{year}_{number}.csv')
        
             

In [6]:
def get_csv(year):
    print (f'STARTING FOR YEAR {year}')
    path = 'pyktok_csv_files'
    all_files = os.listdir(path)
    req_files = [file for file in all_files if 'failed' not in file]

    pyktok_files = [filename for filename in req_files if year in filename]
    
    if len(pyktok_files) == 2:
        pyktok_chunk1 = pd.read_csv(os.path.join(path, pyktok_files[0]))
        pyktok_chunk2 = pd.read_csv(os.path.join(path, pyktok_files[1]))
        pyktok_chunks = pd.concat([pyktok_chunk1, pyktok_chunk2])

    else:
        pyktok_chunks = pd.read_csv(os.path.join(path, pyktok_files[0]))

    #else: 
    #    print(f'length of pyktok files is not 2')
    #print(f'there should be two files, the filenames are {pyktok_files} and the number of files is: {len(pyktok_files)}')

    #these files have all the terms for both chunks
    json_path = 'json_result_files'
    all_json_files = os.listdir(json_path)
    json_file = [j for j in all_json_files if year in j][0]
    print('json file is' + json_file)

    #create dict 
    with open (os.path.join(json_path, json_file), 'r') as jsonFile:
        data = json.load(jsonFile)
        search_term_dct = get_ID_search_term(data)

    #transcript files read in
    transcript_path = 'transcript_csv_files'
    all_transcript_files = os.listdir(transcript_path)
    transcript_files = [t for t in all_transcript_files if year in t]
    print(f'no of transcript files found: {len(transcript_files)}')

    if len(transcript_files) == 2:
        t_chunk1 = pd.read_csv(os.path.join(transcript_path, transcript_files[0]))
        t_chunk2 = pd.read_csv(os.path.join(transcript_path, transcript_files[1]))
        t_chunks = pd.concat([t_chunk1, t_chunk2])

    print(f'there should be two files, the filenames are {pyktok_files} and the number of files is: {len(pyktok_files)}')

    #adding search term and year to pyktok df
    print('adding search term and year to pyktok df')
    search_term_column = []
    for index, row in pyktok_chunks.iterrows():
        try:
            id = row['video_id']
            search_term_column.append(search_term_dct[id])
        except  Exception as e:
            print(f'{e}')
            print(f'could not find video id from json dict for video {row["video_id"]}')
            search_term_column.append(None)
                        
    pyktok_chunks['search_term'] = search_term_column
    pyktok_chunks['year'] = [year] * pyktok_chunks.shape[0]

    #add video id as column to transcript dfs 
    transcript_df = add_vid_id(t_chunks)

    
    #print(transcript_df.columns)
    #print(list(pyktok_chunks.columns))
    #merge the pyktok and t dfs

    df = pyktok_chunks.merge(transcript_df, how='inner' , on='video_id')
    print(type(df))

    #pyktok_chunks.drop(columns=['video_id'])
   # for col in list(transcript_df.columns):
    print(list(df.columns))

    #pyktok_chunks['Subjectivity/Objectivity'] = transcript_df['Subjectivity/Objectivity']
    #pyktok_chunks['Content'] = transcript_df['Content']
    #pyktok_chunks['File Name'] = transcript_df['File Name']

    #transcript_df = transcript_df.add_prefix('transcript_')
    #print(list(transcript_df.columns))

    # Assign columns from transcript_df to pyktok_chunks
    #pyktok_chunks['Subjectivity/Objectivity'] = transcript_df['transcript_Subjectivity/Objectivity']
    #pyktok_chunks['Content'] = transcript_df['transcript_Content']
    #pyktok_chunks['File Name'] = transcript_df['transcript_File Name']


    

    df.to_csv(f'final_result_{year}.csv')
    

In [7]:
get_csv('2023')

STARTING FOR YEAR 2023
json file isurls2023may1.json
no of transcript files found: 2
there should be two files, the filenames are ['results2023_2.csv', 'urls2023may1_result1.csv', 'results2023_1.csv', 'urls2023_may2.csv', 'urls2023_may2.json_1-test_results.csv'] and the number of files is: 5
adding search term and year to pyktok df
6884964731921173766
could not find video id from json dict for video 6884964731921173766
6987410534706449691
could not find video id from json dict for video 6987410534706449691
7253096974545734938
could not find video id from json dict for video 7253096974545734938
7232285401296260378
could not find video id from json dict for video 7232285401296260378
7353247145769716998
could not find video id from json dict for video 7353247145769716998
7347668572329512197
could not find video id from json dict for video 7347668572329512197
7310324819416059142
could not find video id from json dict for video 7310324819416059142
6884964731921173766
could not find video id