In [4]:
import os
import zipfile
import pandas as pd
import json

In [5]:
def read_zipped_csvs(folder_path):
    # List all files in the folder
    all_files = os.listdir(folder_path)

    # Filter out only zip files
    zip_files = [file for file in all_files if file.endswith('.zip')]

    # Initialize a list to store individual DataFrames
    dataframes = []

    # Loop through each zip file
    for zip_file in zip_files:
        print(zip_file)
        # Open the zip file
        with zipfile.ZipFile(os.path.join(folder_path, zip_file), 'r') as z:
            # Loop through each file inside the zip file
            for file in z.namelist():
                # Check if the file is a CSV
                if file.endswith('.csv'):
                    # Read the CSV file into a DataFrame
                    df = pd.read_csv(z.open(file))
                    print(df.shape[0])

                    # Add the DataFrame to the list of DataFrames
                    dataframes.append(df)

    # Concatenate all DataFrames in the list into a single DataFrame
    combined_df = pd.concat(dataframes, ignore_index=True)

    return combined_df

In [6]:
folder_path = 'D:/kickstarter_data'
combined_df = read_zipped_csvs(folder_path)
print(combined_df.head())

Kickstarter_2022-01-20T03_20_11_451Z.zip
3672
3662
3662
3664
3656
3663
3660
3647
827
Kickstarter_2022-02-10T03_20_20_292Z.zip
3654
3649
3656
3662
3692
3669
3652
3554
Kickstarter_2022-03-24T03_20_19_285Z.zip
3640
3663
3669
3672
3666
3677
3683
3663
3653
1283
Kickstarter_2022-04-21T03_20_08_060Z.zip
3660
3663
3663
3662
3696
3681
3657
3664
1646
Kickstarter_2022-05-19T03_20_05_346Z.zip
3654
3665
3661
3657
3655
3660
3657
3667
3652
3660
3643
3664
3656
3668
3659
3659
3647
3666
3665
3663
3663
3661
3647
3662
3658
3665
3662
3657
3655
3654
3661
3658
3668
3658
3661
3641
3668
3662
3665
3656
3660
3649
3656
3660
3663
3669
3659
3656
3658
3656
3656
3667
3658
3655
3648
3666
3657
3666
3655
3665
3651
3662
3155
Kickstarter_2022-06-09T03_20_03_365Z.zip
3653
3667
3669
3662
3660
3662
3649
3663
3667
3662
3662
3662
3643
3656
3671
3661
3661
3672
3643
3660
3672
3666
3666
3663
3648
3665
3660
3670
3657
3661
3658
3650
3664
3675
3660
3665
3667
3648
3658
3664
3667
3658
3662
3660
3651
3662
3671
3659
3658
3660
3646
3664


In [7]:
combined_df.shape[0]

616363

In [8]:
#convert urls column from string to dict
combined_df['urls'] = combined_df['urls'].apply(lambda x: json.loads(x))
#extract project url from the urls column
combined_df['link_to_project'] = combined_df['urls'].apply(lambda x: x.get('web', {}).get('project'))


In [9]:
filtered_df = combined_df[combined_df['state'].isin(["successful", "failed"])]


In [10]:
filtered_df.shape[0]

574665

In [11]:
unique_count = filtered_df['id'].nunique()

In [12]:
unique_count

193343

In [13]:
filtered_df_1 = filtered_df[filtered_df['id'] == 1535549790]


In [14]:
filtered_df_1

Unnamed: 0,backers_count,blurb,category,converted_pledged_amount,country,country_displayable_name,created_at,creator,currency,currency_symbol,...,spotlight,staff_pick,state,state_changed_at,static_usd_rate,urls,usd_exchange_rate,usd_pledged,usd_type,link_to_project
3672,336,A quarterly magazine on design impact.,"{""id"":359,""name"":""Print"",""analytics_name"":""Pri...",34518.0,US,the United States,1575503001,"{""id"":116675903,""name"":""Design Museum Boston"",...",USD,$,...,True,True,successful,1584023419,1.0,{'web': {'project': 'https://www.kickstarter.c...,1.0,34518.0,domestic,https://www.kickstarter.com/projects/designmus...
49697,336,A quarterly magazine on design impact.,"{""id"":359,""name"":""Print"",""analytics_name"":""Pri...",34518.0,US,the United States,1575503001,"{""id"":116675903,""name"":""Design Museum Boston"",...",USD,$,...,True,True,successful,1584023419,1.0,{'web': {'project': 'https://www.kickstarter.c...,1.0,34518.0,domestic,https://www.kickstarter.com/projects/designmus...
74624,336,A quarterly magazine on design impact.,"{""id"":359,""name"":""Print"",""analytics_name"":""Pri...",34518.0,US,the United States,1575503001,"{""id"":116675903,""name"":""Design Museum Boston"",...",USD,$,...,True,True,successful,1584023419,1.0,{'web': {'project': 'https://www.kickstarter.c...,1.0,34518.0,international,https://www.kickstarter.com/projects/designmus...
113830,336,A quarterly magazine on design impact.,"{""id"":359,""name"":""Print"",""analytics_name"":""Pri...",34518.0,US,the United States,1575503001,"{""id"":116675903,""name"":""Design Museum Boston"",...",USD,$,...,True,True,successful,1584023419,1.0,{'web': {'project': 'https://www.kickstarter.c...,1.0,34518.0,international,https://www.kickstarter.com/projects/designmus...
271463,336,A quarterly magazine on design impact.,"{""id"":359,""name"":""Print"",""analytics_name"":""Pri...",34518.0,US,the United States,1575503001,"{""id"":116675903,""name"":""Design Museum Boston"",...",USD,$,...,True,True,successful,1584023419,1.0,{'web': {'project': 'https://www.kickstarter.c...,1.0,34518.0,international,https://www.kickstarter.com/projects/designmus...
313927,336,A quarterly magazine on design impact.,"{""id"":359,""name"":""Print"",""analytics_name"":""Pri...",34518.0,US,the United States,1575503001,"{""id"":116675903,""name"":""Design Museum Boston"",...",USD,$,...,True,True,successful,1584023419,1.0,{'web': {'project': 'https://www.kickstarter.c...,1.0,34518.0,international,https://www.kickstarter.com/projects/designmus...
419263,336,A quarterly magazine on design impact.,"{""id"":359,""name"":""Print"",""analytics_name"":""Pri...",34518.0,US,the United States,1575503001,"{""id"":116675903,""name"":""Design Museum Boston"",...",USD,$,...,True,True,successful,1584023419,1.0,{'web': {'project': 'https://www.kickstarter.c...,1.0,34518.0,domestic,https://www.kickstarter.com/projects/designmus...
483345,336,A quarterly magazine on design impact.,"{""id"":359,""name"":""Print"",""analytics_name"":""Pri...",34518.0,US,the United States,1575503001,"{""id"":116675903,""name"":""Design Museum Boston"",...",USD,$,...,True,True,successful,1584023419,1.0,{'web': {'project': 'https://www.kickstarter.c...,1.0,34518.0,international,https://www.kickstarter.com/projects/designmus...


In [15]:
unique_df = filtered_df.drop_duplicates(subset='blurb', keep='first')


In [16]:
unique_df.shape[0]

191867

In [17]:
unique_df['link_to_project'][1]

'https://www.kickstarter.com/projects/559623833/chirault-volume-2?ref=discovery_category_newest'

In [18]:
unique_df['cleaned_link_to_project'] = unique_df['link_to_project'].apply(lambda x: x.split('?')[0])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_df['cleaned_link_to_project'] = unique_df['link_to_project'].apply(lambda x: x.split('?')[0])


In [19]:
unique_df['cleaned_link_to_project'][0]

'https://www.kickstarter.com/projects/1881653671/medcomic-the-most-entertaining-way-to-study-medici'

In [95]:
from pprint import pprint
import requests
import os

# Structure payload.
output_dir = "D:/kickstarter_json/"

def fetch_data(url):

    payload = {
    'source': 'universal',
    'url': url,
    'user_agent_type': 'desktop',
    'geo_location': 'United States',
    
    }
    # Get response.
    response = requests.request(
    'POST',
    'https://realtime.oxylabs.io/v1/queries',
    auth=('amitoj1996', 'Amitoj1996'),
    json=payload,
    )
    return response.json()
# Instead of response with job status and results url, this will return the
# JSON response with the result.


In [96]:
def get_id_1(url):
    id_1  = unique_df.loc[unique_df['cleaned_link_to_project'] == url,'id'].values[0]
    return id_1

In [97]:
def process_url(url):
    #url = row['cleaned_link_to_project']
    
    id_1 = get_id_1(url)
    
    
    file_name = f'response_{id_1}.json'
    output_path = os.path.join(output_dir, file_name)
    
    if os.path.exists(output_path):
        print(f"Skipping {id_1}, file already exists.")
        return
    
    # Fetch data
    data = fetch_data(url)
    # Save the JSON response to a file for each URL
    with open(output_path, 'w') as outfile:
        json.dump(data, outfile)

In [99]:
import os
import concurrent.futures

start = 1000
end = 4000


urls = unique_df['cleaned_link_to_project'].iloc[start:end].tolist()

max_workers = 16  # Adjust this value based on your machine and network capacity
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
    executor.map(process_url, urls)

In [92]:
for index, row in unique_df.head(1000).iterrows():
    url = row['cleaned_link_to_project']
    id_1 = row['id']
    
    
    file_name = f'response_{id_1}.json'
    output_path = os.path.join(output_dir, file_name)
    
    if os.path.exists(output_path):
        print(f"Skipping {id_1}, file already exists.")
        continue
    
    # Fetch data
    data = fetch_data(url)
    # Save the JSON response to a file for each URL
    with open(output_path, 'w') as outfile:
        json.dump(data, outfile)

Skipping 1767289981, file already exists.
Skipping 1210231263, file already exists.


KeyboardInterrupt: 

In [49]:
import os
import json
from bs4 import BeautifulSoup

#creates a dictionary with 

# Create an empty dictionary to store the extracted text
text_dict = {}
img_dict = {}
rejected_files = []
# Loop through all the files in the kickstarter_json folder
for filename in os.listdir('kickstarter_json'):
    img_count = 0
    # Check if the file is a JSON file
    if filename.endswith('.json'):
        # Open the file and extract the HTML content
        with open(os.path.join('kickstarter_json', filename), 'r') as f:
            data = json.load(f)
            html = data['results'][0]['content']
        
        # Parse the HTML with BeautifulSoup and extract the text
        print(filename)
        soup = BeautifulSoup(html, 'html.parser')
        start_tag = soup.find('a', text='Story')
        
        
        if start_tag is None:
            print(filename + " incorrectly scraped")
            rejected_files.append(filename)
            continue
        first_p_tag = start_tag.find_next('p')
        end_tag = first_p_tag.find_next('a')
        text = ''
        for tag in start_tag.find_all_next():
            if tag == end_tag:
                break
            elif tag.name == 'p':
                text += tag.text.strip() + '\n'
            elif tag.name == 'img':
                img_count += 1
            
        # Add the extracted text to the dictionary
        text_dict[filename] = text
        img_dict[filename] = img_count

# Print the resulting dictionary
#print(text_dict)


response_1000659557.json
response_1000866606.json
response_1001570537.json
response_1002418598.json
response_1002418598.json incorrectly scraped
response_1002531622.json
response_1002638101.json
response_1002684709.json
response_1002684709.json incorrectly scraped
response_1002966619.json
response_1003075861.json
response_100324011.json
response_100324011.json incorrectly scraped
response_1003583457.json
response_1003886371.json
response_1003913695.json
response_1003913695.json incorrectly scraped
response_1004781384.json
response_1005413300.json
response_1005414218.json
response_100562176.json
response_1006075060.json
response_1006762726.json
response_1007021695.json
response_1007217653.json
response_1007217653.json incorrectly scraped
response_100764449.json
response_1007687147.json
response_1007692394.json
response_1007766170.json
response_1008286001.json
response_1008512082.json
response_1009143503.json
response_100929789.json
response_1009317190.json
response_1009329116.json
respo

response_1108094539.json
response_1108157464.json
response_1108307132.json
response_1108478519.json
response_1108734922.json
response_1108801091.json
response_1109450833.json
response_1109469143.json
response_1109469143.json incorrectly scraped
response_111138226.json
response_1111460590.json
response_1111749064.json
response_1111749064.json incorrectly scraped
response_1111758761.json
response_1111948161.json
response_1112228945.json
response_1112636308.json
response_1113095505.json
response_1113313615.json
response_111386040.json
response_1114867227.json
response_1114867227.json incorrectly scraped
response_1115386450.json
response_1116405823.json
response_1116578885.json
response_1116615614.json
response_1116615614.json incorrectly scraped
response_1116695191.json
response_1116969461.json
response_1117253767.json
response_1117973086.json
response_111925692.json
response_111947874.json
response_112008136.json
response_112072987.json
response_112072987.json incorrectly scraped
respons

response_1225772028.json
response_1226422146.json
response_1226755806.json
response_1226900948.json
response_1226925427.json
response_1227100990.json
response_1227150388.json
response_1227441606.json
response_1227483133.json
response_1227483133.json incorrectly scraped
response_1227695889.json
response_1228741764.json
response_1229266414.json
response_1229703080.json
response_1231484240.json
response_1231910994.json
response_1232471340.json
response_1233023673.json
response_1233449306.json
response_1234001474.json
response_1234022955.json
response_1234022955.json incorrectly scraped
response_1235233403.json
response_123535976.json
response_1235364811.json
response_1236252726.json
response_1236252726.json incorrectly scraped
response_1236312568.json
response_1236312568.json incorrectly scraped
response_1236654368.json
response_1238036689.json
response_1238223670.json
response_1238577900.json
response_1238577900.json incorrectly scraped
response_123865579.json
response_123865579.json inc

response_1336943577.json
response_1337690546.json
response_1337954163.json
response_1337954163.json incorrectly scraped
response_1338458665.json
response_1339298935.json
response_1339421234.json
response_1339542178.json
response_1339542178.json incorrectly scraped
response_1341443874.json
response_134157181.json
response_1341575056.json
response_1341591324.json
response_1342269649.json
response_1342400935.json
response_1342649460.json
response_1342821401.json
response_1342821401.json incorrectly scraped
response_1343480382.json
response_1345552895.json
response_1345928873.json
response_1345928873.json incorrectly scraped
response_134624795.json
response_1347458488.json
response_1347850702.json
response_1348612950.json
response_1349857894.json
response_1350799353.json
response_1351065093.json
response_135147988.json
response_135248120.json
response_1352908659.json
response_135347661.json
response_135347661.json incorrectly scraped
response_1353711371.json
response_1354128617.json
respon

response_1450759876.json
response_1450827517.json
response_1450938906.json
response_1451014925.json
response_1451429669.json
response_1451677014.json
response_1451768623.json
response_1452310711.json
response_1452566492.json
response_1452566492.json incorrectly scraped
response_1452596009.json
response_145398052.json
response_1454451657.json
response_1454842559.json
response_1456188848.json
response_1456188848.json incorrectly scraped
response_1456316988.json
response_1456814516.json
response_1458133623.json
response_1458133623.json incorrectly scraped
response_1459355315.json
response_1459801929.json
response_1462792355.json
response_1463299604.json
response_1463523716.json
response_1464113345.json
response_1464113345.json incorrectly scraped
response_146609975.json
response_1466944870.json
response_1467185279.json
response_1467185279.json incorrectly scraped
response_1467581633.json
response_1468649714.json
response_1468649714.json incorrectly scraped
response_1468839061.json
respons

response_1578419865.json incorrectly scraped
response_1578993195.json
response_1578993195.json incorrectly scraped
response_1579002633.json
response_1579900302.json
response_1579908125.json
response_1579940360.json
response_158036737.json
response_158036737.json incorrectly scraped
response_1580677812.json
response_158131171.json
response_1581491576.json
response_1581571422.json
response_1581612174.json
response_1582019174.json
response_1582287582.json
response_1582684535.json
response_1583199994.json
response_1583359597.json
response_1584391308.json
response_1584771396.json
response_1586142311.json
response_1587380466.json
response_1587719498.json
response_1587719498.json incorrectly scraped
response_1587845594.json
response_1588005168.json
response_1589993456.json
response_1590677905.json
response_1590686794.json
response_1590824250.json
response_1591056786.json
response_1591578807.json
response_1592486007.json
response_1592486007.json incorrectly scraped
response_1594337930.json
res

response_1696517441.json
response_1697995072.json
response_1698121041.json
response_1698591968.json
response_1698707842.json
response_1698759396.json
response_1698898403.json
response_1699285868.json
response_1700051924.json
response_1700570817.json
response_1700939683.json
response_1701119401.json
response_1701754007.json
response_1701800413.json
response_1702718129.json
response_1702718129.json incorrectly scraped
response_1702895696.json
response_1705248090.json
response_1705350968.json
response_1705350968.json incorrectly scraped
response_1705547277.json
response_1705647730.json
response_1706452808.json
response_1706485167.json
response_170804878.json
response_170804878.json incorrectly scraped
response_1709409756.json
response_1709623619.json
response_1710655298.json
response_1711103176.json
response_1711285120.json
response_1711383284.json
response_1712085518.json
response_1712085518.json incorrectly scraped
response_1712444486.json
response_1712559637.json
response_1712699784.js

response_1806698114.json
response_1806698114.json incorrectly scraped
response_1806784673.json
response_1806860741.json
response_1806860741.json incorrectly scraped
response_1807251410.json
response_1807251410.json incorrectly scraped
response_1807677509.json
response_180770857.json
response_1807785663.json
response_1807785663.json incorrectly scraped
response_1808104361.json
response_1808352924.json
response_1808594715.json
response_1809196302.json
response_1809322151.json
response_180952195.json
response_1810490649.json
response_181151340.json
response_181151340.json incorrectly scraped
response_1812319410.json
response_1814569577.json
response_1815565171.json
response_1815946706.json
response_1815946706.json incorrectly scraped
response_1816216840.json
response_181662792.json
response_1817881246.json
response_1817881246.json incorrectly scraped
response_1818184116.json
response_1818593797.json
response_1818593797.json incorrectly scraped
response_1819583652.json
response_1819583652.

response_1922984876.json
response_1922984876.json incorrectly scraped
response_1923011399.json
response_1923011399.json incorrectly scraped
response_1923326701.json
response_1923457728.json
response_1924926316.json
response_1924926316.json incorrectly scraped
response_1925031916.json
response_1925159628.json
response_1925169428.json
response_1925221461.json
response_1926029185.json
response_1926029185.json incorrectly scraped
response_1926518106.json
response_1926802718.json
response_1927339915.json
response_1927501511.json
response_1927608194.json
response_1927771435.json
response_1927771435.json incorrectly scraped
response_192891530.json
response_1929048313.json
response_1929581428.json
response_1929731617.json
response_1930058196.json
response_1930949900.json
response_1931128406.json
response_1932407710.json
response_1933657522.json
response_1934379136.json
response_1936965392.json
response_1937104206.json
response_193738321.json
response_1937954943.json
response_1937991320.json
re

response_2046909184.json incorrectly scraped
response_204709353.json
response_2047937878.json
response_204833530.json
response_2049278985.json
response_2050714855.json
response_2051025605.json
response_2051025605.json incorrectly scraped
response_2052010901.json
response_2052068719.json
response_2052068719.json incorrectly scraped
response_2052329512.json
response_2052900834.json
response_2053633499.json
response_2053881291.json
response_2054352449.json
response_2054976777.json
response_2055108152.json
response_2055322753.json
response_2055606340.json
response_205743330.json
response_2057443090.json
response_2057579428.json
response_2057579428.json incorrectly scraped
response_2057660439.json
response_205768400.json
response_2058144969.json
response_2058272369.json
response_2058501685.json
response_205861993.json
response_2058664582.json
response_2058982207.json
response_2059230942.json
response_2059914000.json
response_2060256208.json
response_2060984097.json
response_2061192428.json


response_229202332.json
response_229266301.json
response_229308472.json
response_229423445.json
response_229617840.json
response_229812938.json
response_229958633.json
response_230631978.json
response_230790639.json
response_231581263.json
response_231719478.json
response_232215261.json
response_232215261.json incorrectly scraped
response_232302370.json
response_233568306.json
response_233928723.json
response_233928723.json incorrectly scraped
response_234628109.json
response_234915033.json
response_234915033.json incorrectly scraped
response_23543343.json
response_23543343.json incorrectly scraped
response_236209912.json
response_236437561.json
response_236936837.json
response_236960569.json
response_236970136.json
response_238126473.json
response_238356148.json
response_238771016.json
response_238771687.json
response_23905613.json
response_239316833.json
response_239608824.json
response_240519281.json
response_241081588.json
response_24118134.json
response_241520926.json
response_242

response_351607786.json
response_351713300.json
response_352796885.json
response_353638333.json
response_354253718.json
response_354253718.json incorrectly scraped
response_35438930.json
response_354681629.json
response_355370289.json
response_355548423.json
response_355548423.json incorrectly scraped
response_355574916.json
response_356332982.json
response_356901659.json
response_356904697.json
response_357030926.json
response_357266890.json
response_357266890.json incorrectly scraped
response_357284769.json
response_357641420.json
response_357641420.json incorrectly scraped
response_359441564.json
response_359502142.json
response_359727957.json
response_359784465.json
response_360623472.json
response_361247839.json
response_361303607.json
response_36144126.json
response_36144126.json incorrectly scraped
response_362069867.json
response_362502398.json
response_362776536.json
response_362776536.json incorrectly scraped
response_362831119.json
response_363641846.json
response_365097070.

response_48501048.json
response_485017486.json
response_486254765.json
response_486254765.json incorrectly scraped
response_486356651.json
response_486398172.json
response_486398172.json incorrectly scraped
response_486514686.json
response_48714535.json
response_487705518.json
response_488551219.json
response_489592888.json
response_490233090.json
response_491366008.json
response_491821764.json
response_492013415.json
response_492564826.json
response_492605314.json
response_492605314.json incorrectly scraped
response_492736800.json
response_492736800.json incorrectly scraped
response_494594526.json
response_494659268.json
response_494659268.json incorrectly scraped
response_495504943.json
response_496276565.json
response_496731550.json
response_496731550.json incorrectly scraped
response_496745705.json
response_496994635.json
response_497059393.json
response_497230320.json
response_497230320.json incorrectly scraped
response_497729662.json
response_498833637.json
response_499528406.jso

response_609526988.json incorrectly scraped
response_609864832.json
response_609864832.json incorrectly scraped
response_610009570.json
response_610009570.json incorrectly scraped
response_610359335.json
response_610951602.json
response_610951602.json incorrectly scraped
response_610960475.json
response_610985535.json
response_611686167.json
response_611765441.json
response_612068724.json
response_61224504.json
response_61224504.json incorrectly scraped
response_61226445.json
response_612651151.json
response_613018582.json
response_613248721.json
response_613600913.json
response_614828885.json
response_615160743.json
response_615186837.json
response_615791200.json
response_615835066.json
response_616191779.json
response_616545743.json
response_616545743.json incorrectly scraped
response_616639458.json
response_616639458.json incorrectly scraped
response_617311042.json
response_618139743.json
response_618139743.json incorrectly scraped
response_618594919.json
response_618945040.json
res

response_7276164.json incorrectly scraped
response_727677710.json
response_728483948.json
response_728483948.json incorrectly scraped
response_728903394.json
response_72916421.json
response_73117631.json
response_731910136.json
response_732958929.json
response_732958929.json incorrectly scraped
response_73371979.json
response_73371979.json incorrectly scraped
response_734961100.json
response_734961100.json incorrectly scraped
response_735487661.json
response_735487661.json incorrectly scraped
response_735811281.json
response_735853103.json
response_735990754.json
response_736017505.json
response_736547996.json
response_736547996.json incorrectly scraped
response_73682242.json
response_737043303.json
response_737097965.json
response_737416616.json
response_737669022.json
response_73799682.json
response_73799682.json incorrectly scraped
response_738023007.json
response_738023007.json incorrectly scraped
response_738487317.json
response_739340404.json
response_740005708.json
response_7400

response_84720579.json
response_847605407.json
response_847605407.json incorrectly scraped
response_848055409.json
response_848899406.json
response_849997057.json
response_850183256.json
response_850698683.json
response_85081659.json
response_851727841.json
response_85208340.json
response_852583995.json
response_853989165.json
response_854348540.json
response_855034939.json
response_855456826.json
response_855983144.json
response_856867677.json
response_857208495.json
response_857799723.json
response_858047015.json
response_858448285.json
response_858642578.json
response_858642578.json incorrectly scraped
response_859082773.json
response_859082773.json incorrectly scraped
response_860061738.json
response_860512080.json
response_861658904.json
response_862656167.json
response_862709910.json
response_865177001.json
response_865798864.json
response_866113201.json
response_866634035.json
response_867168840.json
response_867862536.json
response_868318298.json
response_868792666.json
respons

response_97121282.json incorrectly scraped
response_971679256.json
response_971679256.json incorrectly scraped
response_972196106.json
response_972196106.json incorrectly scraped
response_973188834.json
response_973560351.json
response_973883863.json
response_974016282.json
response_974016282.json incorrectly scraped
response_974562133.json
response_974931176.json
response_975550220.json
response_975640050.json
response_975924788.json
response_976550591.json
response_977135218.json
response_977247283.json
response_977247283.json incorrectly scraped
response_978449413.json
response_978809189.json
response_979205048.json
response_979350508.json
response_979350508.json incorrectly scraped
response_979699838.json
response_980030500.json
response_980088143.json
response_980835861.json
response_981204636.json
response_981296428.json
response_981296428.json incorrectly scraped
response_982282766.json
response_982867900.json
response_983111069.json
response_983595623.json
response_983595623.js

In [30]:
count=0
for key, value in text_dict.items():
    if count == 5:
        break
    print(key, value)
    count += 1

response_1000659557.json a horror punk band! Coming from people who just want to play and write, it has been our dream for years! To create something that is unique something that will leave our mark on this great world! We have started writing however, our equipment is meager at best  witch puts a damper on being able to create and share our music! We would love to be able to get to work and produce something great and start doing shows all we need is a little kickstart! We are starting from scratch! We still need a drummer in wich we are searching for I'm casey aka "carnage" I have been playing guitar for 11yrs  we have Dave on vocs aka "sid" who has been vocalist for many years he is a very wonderful lyrist and my brother for the past 8yrs and on bass emanuel who we belive was born with a bass in his hands lol  to us this is about bringing our talent and love of music to the next level!
Thank you for reading,
New world disorder.
We really don't see any risks at the moment it's just 

In [53]:
id_value = 1000659557

key = "reason_1000659557.json" + str(id_value) + ".json"
print(img_dict.get('response_1000866606.json', ''))

70


In [45]:
text_dict['response_1000866606.json']

"What if, in a few years, a parent doesn’t know any better than to playfully practice getting out of the water after an accidental fall with their kids aged 2 years and up? \xa0By making it fun!\nJoin now and Am-Fibby-Fy humanity with us!\nHugo van der Spek , Founder of Fibby\n👇Watch here👇 how Mik (2), Evi (3) and Jip (4) become more ‘water safe’ with Fibby by playing with their parents. And pay special attention to the fun they have with it.\n⚠️ Let children always wear a Fibby and never let them out of your sight\nHowever, by playfully practicing 'jump in & climb out', 'float & propel', and 'breathe & be underwater' eventually even without Fibby, you significantly minimize the chance of drowning accidents even when they don't wear a Fibby.\nIn the future, \xa0a part of your Fibby purchase will contribute directly to the Dutch Don't Drown Foundation. But first, let's bring Fibby to life together on Kickstarter!👇Watch here👇 our first pilot with the Dutch Don’t Drown Foundation in South

In [50]:


# Custom function to map id to description value in the dictionary
def id_to_description(id_value):
    key = "response_" + str(id_value) + ".json"
    return text_dict.get(key, '')

def img_count_to_img_count(id_value):
    key = "response_" + str(id_value) + ".json"
    return img_dict.get(key, '')

# Create a new column called 'description' and map the 'id' column values to their corresponding description values
unique_df['description'] = unique_df['id'].apply(id_to_description)
unique_df['img_count'] = unique_df['id'].apply(img_count_to_img_count)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_df['description'] = unique_df['id'].apply(id_to_description)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_df['img_count'] = unique_df['id'].apply(img_count_to_img_count)


In [52]:
unique_df['img_count']

0         3
1         2
2          
3         2
4          
         ..
614453     
614454     
615111     
615179     
615180     
Name: img_count, Length: 191867, dtype: object

In [25]:
result = unique_df[unique_df['description'].notnull()]


0          
1          
2          
3          
4          
         ..
614453     
614454     
615111     
615179     
615180     
Name: description, Length: 191867, dtype: object