# Next steps: further analysis of the OpenLibrary data

Let's clean up the data that we queried for further analysis:
- Null data
- Duplicates
- Outliers
- Scaling numerical features
- Balancing
- Encoding categorical features

In [None]:
# Preview of the numerical columns that are candidates for scaling.
data_df[
    [
        'first_publish_year',
        'number_of_pages_median',
        'ratings_average',
        'ratings_sortable',
        'ratings_count',
        'readinglog_count',
        'want_to_read_count',
        'currently_reading_count',
        'already_read_count',
    ]
]
    


In [None]:
from sklearn.preprocessing import RobustScaler

# Create the scaler and let it learn the median and IQR of
# 'number_of_pages_median' in one expression (fit returns the scaler itself).
rb_scaler = RobustScaler().fit(pop_df[['number_of_pages_median']])

# Replace the raw page counts with their robust-scaled values, i.e.
# (value - median) / IQR for every row.
# NOTE: the column is overwritten in place, so running this cell a second
# time rescales values that were already scaled.
pop_df['number_of_pages_median'] = rb_scaler.transform(
    pop_df[['number_of_pages_median']]
)

pop_df.head()

In [None]:
# Enrich the language column to reduce the number of NaN values

# current list of all languages:

# Running list of every language code seen so far, seeded with English.
unique_langs = ['eng']


def check_lang(row):
    """Record language codes from ``row['language']`` that are not yet known.

    Rows whose 'language' entry is not a list are ignored. New codes are
    appended to the module-level ``unique_langs`` list in the order they are
    encountered; nothing is returned.
    """
    codes = row['language']
    if not isinstance(codes, list):
        return
    for code in codes:
        if code in unique_langs:
            continue
        unique_langs.append(code)
# Visit the frame one row at a time so check_lang can collect each row's codes.
data_df.apply(check_lang, axis='columns')

print(unique_langs)

In [None]:
# Fill in missing languages:
import langid

# langid returns ISO 639-1 (two-letter) codes, whereas OpenLibrary appears to provide three-letter codes (assumed here to be ISO 639-3).
# ISO 639-1 to ISO 639-3 mapping dictionary:
# Maps two-letter ISO 639-1 codes to their three-letter ISO 639-3 equivalents.
# NOTE(review): the target format is assumed to match the codes used in the
# 'language' column; verify against the values actually present in the data
# (e.g. whether French appears as 'fra' or 'fre').
iso639_1_to_3 = {
    'af': 'afr',  # Afrikaans
    'sq': 'sqi',  # Albanian
    'ar': 'ara',  # Arabic
    'hy': 'hye',  # Armenian
    'bn': 'ben',  # Bengali
    'bs': 'bos',  # Bosnian
    'ca': 'cat',  # Catalan
    'hr': 'hrv',  # Croatian
    'cs': 'ces',  # Czech
    'da': 'dan',  # Danish
    'nl': 'nld',  # Dutch
    'en': 'eng',  # English
    'eo': 'epo',  # Esperanto
    'et': 'est',  # Estonian
    'fi': 'fin',  # Finnish
    'fr': 'fra',  # French
    'de': 'deu',  # German
    'el': 'ell',  # Greek
    'gu': 'guj',  # Gujarati
    'he': 'heb',  # Hebrew
    'hi': 'hin',  # Hindi
    'hu': 'hun',  # Hungarian
    'is': 'isl',  # Icelandic
    'id': 'ind',  # Indonesian
    'it': 'ita',  # Italian
    'ja': 'jpn',  # Japanese
    'jv': 'jav',  # Javanese (official ISO 639-1 code)
    'jw': 'jav',  # Javanese (legacy alias, kept for backward compatibility)
    'kn': 'kan',  # Kannada
    'km': 'khm',  # Khmer
    'ko': 'kor',  # Korean
    'la': 'lat',  # Latin
    'lv': 'lav',  # Latvian
    'lt': 'lit',  # Lithuanian
    'mk': 'mkd',  # Macedonian
    'ml': 'mal',  # Malayalam
    'mn': 'mon',  # Mongolian
    'mr': 'mar',  # Marathi
    'my': 'mya',  # Burmese
    'ne': 'nep',  # Nepali
    'no': 'nor',  # Norwegian
    'or': 'ori',  # Odia
    'pa': 'pan',  # Punjabi
    'pl': 'pol',  # Polish
    'ps': 'pus',  # Pashto
    'pt': 'por',  # Portuguese
    'ro': 'ron',  # Romanian
    'ru': 'rus',  # Russian
    'sa': 'san',  # Sanskrit
    'sd': 'snd',  # Sindhi
    'si': 'sin',  # Sinhala
    'sk': 'slk',  # Slovak
    'sl': 'slv',  # Slovenian
    'es': 'spa',  # Spanish
    'su': 'sun',  # Sundanese
    'sw': 'swa',  # Swahili
    'sv': 'swe',  # Swedish
    'ta': 'tam',  # Tamil
    'te': 'tel',  # Telugu
    'th': 'tha',  # Thai
    'tr': 'tur',  # Turkish
    'uk': 'ukr',  # Ukrainian
    'ur': 'urd',  # Urdu ('udm' is Udmurt, a different language)
    'vi': 'vie',  # Vietnamese
    'cy': 'cym',  # Welsh
    'xh': 'xho',  # Xhosa
    'yi': 'yid',  # Yiddish
    'yo': 'yor',  # Yoruba
    'zu': 'zul',  # Zulu
    # Further ISO 639-1 codes that were missing from the original table.
    'am': 'amh',  # Amharic
    'az': 'aze',  # Azerbaijani
    'eu': 'eus',  # Basque
    'be': 'bel',  # Belarusian
    'bg': 'bul',  # Bulgarian
    'zh': 'zho',  # Chinese
    'gl': 'glg',  # Galician
    'ka': 'kat',  # Georgian
    'ga': 'gle',  # Irish
    'kk': 'kaz',  # Kazakh
    'lo': 'lao',  # Lao
    'ms': 'msa',  # Malay
    'mt': 'mlt',  # Maltese
    'nb': 'nob',  # Norwegian Bokmal
    'nn': 'nno',  # Norwegian Nynorsk
    'fa': 'fas',  # Persian
    'sr': 'srp',  # Serbian
    'tl': 'tgl',  # Tagalog
}

# Function to Identify and Convert Language Codes
def convert_language_code(code):
    """Normalise a language code to its three-letter form.

    Args:
        code: Candidate language code. Surrounding whitespace and letter case
            are ignored. Non-string values (None, NaN, ...) are accepted and
            treated as invalid instead of raising.

    Returns:
        * the mapped three-letter code when ``code`` is a two-letter code
          present in ``iso639_1_to_3``;
        * the (lower-cased) two-letter code itself when it is not in the
          table - this fallback is kept from the original behaviour;
        * the (lower-cased) code unchanged when it is three letters long;
        * ``None`` for anything else.
    """
    if not isinstance(code, str):
        return None

    code = code.strip().lower()
    if not code.isalpha():
        # Empty strings, digits and punctuation are never valid codes.
        return None

    if len(code) == 2:
        # ISO 639-1 -> three-letter code; unknown codes pass through unchanged.
        return iso639_1_to_3.get(code, code)
    if len(code) == 3:
        # Already in three-letter form.
        return code
    return None

# Function to identify and add language for NaN values under 'language'
def add_lang(row):
    """Fill a missing 'language' entry with the language detected from the title.

    Intended for row-wise use (``DataFrame.apply(add_lang, axis=1)``).

    Args:
        row: Mapping-like row exposing 'language' and 'title'.

    Returns:
        The row unchanged when 'language' is already a list or is not missing,
        when 'title' is not a non-blank string, or when the detected code
        cannot be converted. Otherwise a copy of the row whose 'language' is a
        one-element list holding the converted code.
    """
    current = row['language']
    # Existing language lists and any other non-missing value are left alone.
    if isinstance(current, list) or not pd.isna(current):
        return row

    title = row['title']
    # The language detector needs real text; a NaN or blank title cannot be classified.
    if not isinstance(title, str) or not title.strip():
        return row

    detected, _confidence = langid.classify(title)
    converted = convert_language_code(detected)
    if converted is None:
        # Storing [None] would only swap one kind of missing value for another.
        return row

    # Work on a copy: mutating the object handed in by DataFrame.apply is
    # discouraged and can have surprising effects.
    row = row.copy()
    row['language'] = [converted]
    return row

# Pass every row through add_lang (which fills missing 'language' entries from
# the language detected in the title) and keep the resulting frame.
data_df = data_df.apply(add_lang, axis=1)


In [None]:
## SUBJECT ANALYSIS USING TFIDF VECTORIZER

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt

# DataFrame
# Keep only rows that have a subject, as an independent copy: later column
# assignments on subject_df then neither raise SettingWithCopyWarning nor
# risk touching data_df.
subject_df = data_df.dropna(subset=['subject']).copy()

# Sanity check: which Python types occur in the 'subject' column?
print(subject_df['subject'].apply(type).value_counts())

In [None]:


# Convert each row's list of subject keywords into one space-separated string
# for the vectoriser. Values that are already strings are left untouched, so
# re-running this cell is harmless (joining a str would otherwise insert a
# space between every character).
subject_df['subject'] = subject_df['subject'].apply(
    lambda subjects: ' '.join(map(str, subjects))
    if isinstance(subjects, (list, tuple))
    else subjects
)

# TF-IDF Vectorization: X holds the term weights per row, y the target column.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(subject_df['subject'])
y = subject_df['readinglog_count']

In [None]:
# Linear Regression on the TF-IDF features (fixed random_state for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
model = LinearRegression()
model.fit(X_train, y_train)
print(f'R^2 score: {model.score(X_test, y_test)}')

# Keyword importance = fitted coefficient of each TF-IDF term.
feature_names = vectorizer.get_feature_names_out()
importance = model.coef_
keywords_importance = pd.DataFrame({'Keyword': feature_names, 'Importance': importance})

# Drawing one bar per vocabulary term produces an unreadable (and very slow)
# chart, so only the terms with the largest coefficients are plotted. The full
# table remains available in keywords_importance.
TOP_N_KEYWORDS = 20
top_keywords = keywords_importance.sort_values(by='Importance', ascending=False).head(TOP_N_KEYWORDS)

ax = sns.barplot(x='Importance', y='Keyword', data=top_keywords)
ax.set_title(f'Top {TOP_N_KEYWORDS} subject keywords by regression coefficient')
ax.set_xlabel('Coefficient')
ax.set_ylabel('Keyword')
plt.show()

In [None]:
import requests
import time
import json

def fetch_openlibrary_data(api_url, params=None, max_retries=5, sleep_between_retries=5, timeout=30):
    """GET ``api_url`` and return the decoded JSON body, retrying on failure.

    Args:
        api_url: URL to request.
        params: Optional dict of query-string parameters.
        max_retries: Maximum number of attempts before giving up.
        sleep_between_retries: Seconds to wait between two attempts.
        timeout: Seconds to wait for the server before an attempt is treated
            as failed. Without a timeout a stalled connection would block
            forever and the Timeout handler below could never run.

    Returns:
        The parsed JSON payload, or ``None`` when every attempt failed.
    """
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(api_url, params=params, timeout=timeout)
            response.raise_for_status()  # Raise an error on bad status codes
            return response.json()
        except requests.exceptions.HTTPError as e:
            print(f"HTTP error occurred: {e}")
        except requests.exceptions.ConnectionError as e:
            print(f"Connection error occurred: {e}")
        except requests.exceptions.Timeout as e:
            print(f"Timeout occurred: {e}")
        except requests.exceptions.RequestException as e:
            print(f"General error occurred: {e}")
        except ValueError as e:
            # Body was not valid JSON (older requests versions raise a plain ValueError).
            print(f"Invalid JSON in response: {e}")

        # Only announce and wait when another attempt will actually follow.
        if attempt < max_retries:
            print(f"Retrying... ({attempt}/{max_retries})")
            time.sleep(sleep_between_retries)

    print(f"Giving up after {max_retries} attempt(s).")
    return None

def _extract_records(payload):
    """Return the list of records inside an API payload, or [] if none is found."""
    if isinstance(payload, list):
        return payload
    if isinstance(payload, dict):
        # 'entries' is the key the original code expected; 'docs' is the key
        # used by search.json responses.
        for key in ('entries', 'docs'):
            value = payload.get(key)
            if isinstance(value, list):
                return value
    return []


def fetch_all_data(base_url, total_records, records_per_page=100):
    """Page through ``base_url`` with offset/limit and collect the records.

    Args:
        base_url: Endpoint to query.
        total_records: Upper bound on the number of records to request.
        records_per_page: Page size sent as ``limit``.

    Returns:
        A list with every record retrieved. Paging stops early when a request
        fails or when the server returns a page without records.
    """
    all_data = []
    for start in range(0, total_records, records_per_page):
        # Never ask for more than the caller's total on the last page.
        page_size = min(records_per_page, total_records - start)
        params = {
            'offset': start,
            'limit': page_size
        }
        data = fetch_openlibrary_data(base_url, params=params)
        if data is None:
            print("Failed to retrieve data. Exiting.")
            break

        records = _extract_records(data)
        if not records:
            # An empty page means the result set is exhausted.
            print("No more records returned. Stopping.")
            break

        all_data.extend(records)
        print(f"Retrieved {len(records)} records, Total: {len(all_data)}")

        # Throttle requests to avoid rate limiting, but not after the final page.
        if start + records_per_page < total_records:
            time.sleep(1)

    return all_data

# Base API URL and query parameters
# NOTE(review): search.json normally expects a query (e.g. ?q=...) - confirm
# that the bare endpoint returns the records you want.
api_base_url = "https://openlibrary.org/search.json"
total_records = 200000  # Replace with your actual total record count
records_per_page = 100  # Adjust based on the API's maximum limit
output_path = 'openlibrary_data.json'

# A full crawl is about total_records / records_per_page throttled requests,
# so reuse an earlier download when one exists. Delete the file to force a
# fresh fetch.
data = None
try:
    with open(output_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    print(f"Loaded cached records from {output_path}")
except FileNotFoundError:
    pass

if not data:
    # Fetch data
    data = fetch_all_data(api_base_url, total_records, records_per_page)

    # Save to a file - but never replace it with an empty result from a failed crawl.
    if data:
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(data, f)
    else:
        print("No records fetched; nothing written.")

print(f"Total records fetched: {len(data)}")


In [None]:
# COMPRESSING: down-cast the publication year to a 16-bit integer.
# Plain "int16" cannot represent missing values, so fall back to pandas'
# nullable "Int16" when the column still contains NaN instead of raising.
year_dtype = "Int16" if data_df["first_publish_year"].isna().any() else "int16"
data_df["first_publish_year"] = data_df["first_publish_year"].astype(year_dtype)

# TODO: planned down-casts for the remaining numeric columns (not applied yet):
#     "number_of_pages_median" : "float32",
#     "ratings_average" : "float32",
#     "ratings_sortable": "float32",
#     "ratings_count": "int32",
#     "readinglog_count": "int32",
#     "want_to_read_count": "int32",
#     "currently_reading_count": "int32",
#     "already_read_count": "int32"
data_df

In [None]:
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return the English stop-word set, loading it only on first use."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


def clean_string(sentence):
    """Standardise a piece of text for further analysis.

    Steps: trim and lower-case, drop digits and punctuation, tokenise, remove
    English stop words, lemmatise every token as a verb and re-join the tokens
    with single spaces.

    Args:
        sentence: A string, or a list whose items are joined with spaces into
            one string before cleaning.

    Returns:
        The cleaned string ('' when nothing is left after cleaning), or
        ``None`` - after printing a message - when ``sentence`` is neither a
        string nor a list.
    """
    if isinstance(sentence, list):
        # Lists have no str methods; fold them into one sentence first.
        sentence = ' '.join(str(item) for item in sentence)
    elif not isinstance(sentence, str):
        print(f"{sentence} is not a list, nor a string")
        return None

    # Basic cleaning: whitespace, case, digits
    sentence = sentence.strip().lower()
    sentence = ''.join(char for char in sentence if not char.isdigit())

    # Advanced cleaning: strip all punctuation in a single pass
    sentence = sentence.translate(str.maketrans('', '', string.punctuation))

    # Nothing left to tokenise
    if not sentence.strip():
        return ''

    stop_words = _english_stop_words()
    lemmatizer = WordNetLemmatizer()  # one instance per call, not one per word

    lemmatized = [
        lemmatizer.lemmatize(word, pos="v")
        for word in word_tokenize(sentence)
        if word not in stop_words
    ]
    return ' '.join(lemmatized)

In [None]:
# Store a cleaned copy of every subject list in 'subject_clean'; cells that
# are not lists are carried over unchanged.
data_df['subject_clean'] = data_df['subject'].apply(
    lambda subjects: subjects
    if not isinstance(subjects, list)
    else list(map(clean_string, subjects))
)

print(data_df['subject_clean'].head())

In [None]:
# List of widely recognized genres
book_genres = [
    "Literary Fiction",
    "Historical Fiction",
    "Science Fiction",
    "Fantasy",
    "Mystery",
    "Thriller",
    "Romance",
    "Horror",
    "Young Adult",
    "Dystopian",
    "Adventure",
    "Crime",
    "Magical Realism",
    "Graphic Novel",
    "Comic",
    "Biography",
    "Autobiography",
    "Memoir",
    "Self Help",
    "True Crime",
    "History",
    "Travel",
    "Science",
    "Philosophy",
    "Religion",
    "Spirituality",
    "Business",
    "Economics",
    "Health",
    "Fitness",
    "Politics",
    "Essays",
    "Cookbook",
    "Art",
    "Photography",
    "Poetry",
    "Drama",
    "Play",
    "Short Story",
    "Children",
    "New Adult",
    "Chick Lit",
    "Westerns",
    "Classics"
]

# Lower-cased genre names, built once so each lookup is a single set
# membership test instead of re-lowering the whole list for every candidate.
# Re-run this cell after editing book_genres so the set stays in sync.
# NOTE(review): candidates are compared verbatim (ignoring case). Cleaned
# subjects have punctuation removed without a space and their tokens
# lemmatised as verbs, so e.g. "Self-help" (-> "selfhelp") or plural entries
# such as "Essays" may never match - confirm against real data.
_BOOK_GENRES_LOWER = frozenset(genre.lower() for genre in book_genres)


# Function to check if a string matches any genre in the list
def is_genre(string):
    """Return the lower-cased genre when ``string`` names a known genre.

    Args:
        string: Candidate text. Non-string or empty values never match.

    Returns:
        ``string.lower()`` when it equals an entry of ``book_genres``
        (case-insensitively), otherwise ``None``.
    """
    if not isinstance(string, str) or not string:
        return None
    lowered = string.lower()
    return lowered if lowered in _BOOK_GENRES_LOWER else None

# Function to clean and extract genres from the subject_clean column
def cleaningr(lst):
    """Return the recognised genres found in ``lst``, in their original order.

    Every element is passed to ``is_genre``; elements for which it returns a
    falsy value are dropped.
    """
    candidates = (is_genre(entry) for entry in lst)
    return [genre for genre in candidates if genre]

# Apply the function to each row in the subject_clean column
data_df['genre'] = data_df['subject_clean'].apply(
    lambda subjects: [] if not isinstance(subjects, list) else cleaningr(subjects)
)

print(data_df)