In [5]:
import os
import re

import pandas as pd


In [7]:
# Define the path to the metadata file
metadata_file_path = os.path.join('statista_dataset/dataset_copy', 'metadata.csv')

# Load the metadata into a DataFrame
metadata = pd.read_csv(metadata_file_path)

# Inspect the first few rows
print(metadata.head())


   id                                              title  \
0   1  \r\n                        Number of monthly ...   
1   2  \r\n                        United States: est...   
2   3  \r\n                        Reported violent c...   
3   4  \r\n                        Players with the m...   
4   5  \r\n                        Players with most ...   

                                   dataPath  \
0  statista_dataset/dataset_copy/data/1.csv   
1  statista_dataset/dataset_copy/data/2.csv   
2  statista_dataset/dataset_copy/data/3.csv   
3  statista_dataset/dataset_copy/data/4.csv   
4  statista_dataset/dataset_copy/data/5.csv   

                                    imgPath  \
0  statista_dataset/dataset_copy/imgs/1.png   
1  statista_dataset/dataset_copy/imgs/2.png   
2  statista_dataset/dataset_copy/imgs/3.png   
3  statista_dataset/dataset_copy/imgs/4.png   
4  statista_dataset/dataset_copy/imgs/5.png   

                                             caption  \
0  How many users

In [9]:
# Define columns to include
columns_to_include = ['imgPath', 'caption', 'chartType', 'xAxis', 'yAxis']

# Filter the metadata
filtered_metadata = metadata[columns_to_include]

# Save the filtered metadata to the same directory
filtered_metadata_file_path = os.path.join('statista_dataset/dataset_copy', 'filtered_metadata.csv')
filtered_metadata.to_csv(filtered_metadata_file_path, index=False)

print(f"Filtered metadata saved to '{filtered_metadata_file_path}'.")


Filtered metadata saved to 'statista_dataset/dataset_copy\filtered_metadata.csv'.


In [10]:
# Load the filtered metadata
filtered_metadata = pd.read_csv(filtered_metadata_file_path)

# Inspect the filtered metadata
print(filtered_metadata.head())


                                    imgPath  \
0  statista_dataset/dataset_copy/imgs/1.png   
1  statista_dataset/dataset_copy/imgs/2.png   
2  statista_dataset/dataset_copy/imgs/3.png   
3  statista_dataset/dataset_copy/imgs/4.png   
4  statista_dataset/dataset_copy/imgs/5.png   

                                             caption chartType       xAxis  \
0  How many users does Facebook have? With over 2...      line  Unnamed: 0   
1   This statistic presents the estimated net wor...       bar  Unnamed: 0   
2   This statistic shows the reported violent cri...      line  Unnamed: 0   
3   Emmitt Smith is the all-time rushing leader o...       bar  Unnamed: 0   
4   As of July 2020, Cristiano Ronaldo leads the ...       bar  Unnamed: 0   

                                               yAxis  
0                        Number of users in millions  
1                  Net worth in billion U.S. dollars  
2  Reported violent crime rate per 100,000 popula...  
3                           

In [11]:
# Check if all image paths are valid
invalid_img_paths = filtered_metadata[~filtered_metadata['imgPath'].apply(os.path.exists)]

if len(invalid_img_paths) > 0:
    print(f"Found {len(invalid_img_paths)} invalid image paths.")
    print(invalid_img_paths)
else:
    print("All image paths are valid.")


All image paths are valid.


In [12]:
# Inspect the data
print(f"Total charts: {len(metadata)}")
print(metadata.head())

Total charts: 27868
   id                                              title  \
0   1  \r\n                        Number of monthly ...   
1   2  \r\n                        United States: est...   
2   3  \r\n                        Reported violent c...   
3   4  \r\n                        Players with the m...   
4   5  \r\n                        Players with most ...   

                                   dataPath  \
0  statista_dataset/dataset_copy/data/1.csv   
1  statista_dataset/dataset_copy/data/2.csv   
2  statista_dataset/dataset_copy/data/3.csv   
3  statista_dataset/dataset_copy/data/4.csv   
4  statista_dataset/dataset_copy/data/5.csv   

                                    imgPath  \
0  statista_dataset/dataset_copy/imgs/1.png   
1  statista_dataset/dataset_copy/imgs/2.png   
2  statista_dataset/dataset_copy/imgs/3.png   
3  statista_dataset/dataset_copy/imgs/4.png   
4  statista_dataset/dataset_copy/imgs/5.png   

                                             caption 

In [13]:
# Count the number of charts for each chart type
chart_type_counts = metadata['chartType'].value_counts()

# Print chart type information
print("\n=== Chart Type Information ===")
print("Number of charts of each chart type")
for chart_type, count in chart_type_counts.items():
    print(f"{chart_type}: {count}")


=== Chart Type Information ===
Number of charts of each chart type
column: 16319
bar: 8272
line: 2646
pie: 408
table: 223


In [20]:
# Tokenize captions (split by whitespace)
metadata['token_count'] = metadata['caption'].apply(lambda x: len(x.split()))

# Compute token statistics
average_token_count = metadata['token_count'].mean()
total_tokens = metadata['token_count'].sum()
unique_tokens = len(set(" ".join(metadata['caption']).split()))

print("\n=== Token Information ===")
print(f"Average token count per summary: {average_token_count}")
print(f"Total tokens: {total_tokens}")
print(f"Total types (unique tokens): {unique_tokens}")



=== Token Information ===
Average token count per summary: 69.05533228075211
Total tokens: 1924434
Total types (unique tokens): 73296


In [19]:
# Count sentences (split by periods)
metadata['sentence_count'] = metadata['caption'].apply(lambda x: len(x.split('.')))

# Compute sentence statistics
average_sentence_count = metadata['sentence_count'].mean()

print("\n=== Sentence Information ===")
print(f"Average sentence count per summary: {average_sentence_count}")



=== Sentence Information ===
Average sentence count per summary: 4.932359695708339
