In [31]:
import pandas as pd
import os
import django

# Configure Django before django.setup() so the project's models can be
# imported from the notebook. DJANGO_ALLOW_ASYNC_UNSAFE lets the synchronous
# ORM be called from inside the notebook kernel's running event loop.
os.environ.update({
    "DJANGO_SETTINGS_MODULE": "wemedia.settings",
    "DJANGO_ALLOW_ASYNC_UNSAFE": "true",
})
django.setup()

In [2]:
# Load the spreadsheet to import; the path is relative to the notebook's
# working directory. Later cells read the Topics, is_op, Book_N and Author_N
# columns from this frame.
df_input = pd.read_excel('data/booksuggestions_hot_25_24_Mar.xlsx')

In [3]:
# Preview the first rows to check which columns were parsed from the sheet.
df_input.head()

Unnamed: 0,submission_url,reddit_id,text,score,is_op,Topics,Book_1,Author_1,Book_2,Author_2,...,Author_4,Book_5,Author_5,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22
0,https://www.reddit.com/r/booksuggestions/comme...,fnt620,Scythe by Neal shusterman || The arc of the sc...,121,True,,scythe,Neal shusterman,,,...,,,,,,,,,,
1,https://www.reddit.com/r/booksuggestions/comme...,flbcak0,"It’s soooo good, I’m probably gonna reread the...",15,False,,,,,,...,,,,,,,,,,
2,https://www.reddit.com/r/booksuggestions/comme...,flbyruh,the unwind series by shusterman is great too! ...,15,False,,The unwind,shusterman,,,...,,,,,,,,,,
3,https://www.reddit.com/r/booksuggestions/comme...,flblpeb,It’s on my “to-be-read” shelf! I was thinking ...,8,False,,,,,,...,,,,,,,,,,
4,https://www.reddit.com/r/booksuggestions/comme...,flbrv9z,Not to toot my own horn... But I havethe thund...,5,False,,the thunderd,,the toll,,...,,,,,,,,,,


In [6]:
from discuss.models import topic, content

In [51]:
#helper functions
def convert_string_to_list(text):
    '''
    Converts a comma separated string to a list of trimmed, non-empty items.

    text : raw cell value, e.g. "fantasy, sci-fi". Anything pd.isna reports
           as missing (NaN, None) yields an empty list. Non-string scalars
           (e.g. a numeric spreadsheet cell) are converted with str() first.

    Returns a list of str in the original order. Surrounding whitespace is
    stripped and blank items (from "", "a,,b" or a trailing comma) are dropped.
    '''
    if pd.isna(text):
        return []

    # Spreadsheet cells may come back as numbers; str() keeps .split from
    # raising AttributeError on them.
    pieces = (piece.strip() for piece in str(text).split(','))

    # Drop blanks so callers never receive an empty-string entry.
    return [piece for piece in pieces if piece]

def parse_text(text):
    '''
    Normalises a cell value: a missing value (as judged by pd.isna) becomes
    the empty string, any other value is handed back unchanged.
    '''
    return "" if pd.isna(text) else text

def extract_books_from_row(row, max_books=5):
    '''
    Returns list of contents extracted for this row.

    Inspects the column pairs Book_1/Author_1 .. Book_<max_books>/
    Author_<max_books>. A pair produces one content object when at least one
    of its two cells is present; the missing half is stored as "". The
    objects are only constructed here, not saved.

    row       : mapping-like row (e.g. a pandas Series) holding those columns.
    max_books : number of Book_N/Author_N pairs to inspect (default 5).

    A missing Book_N/Author_N column raises whatever row[...] raises for an
    unknown key.
    '''
    content_list = []
    for slot in range(1, max_books + 1):
        book = row[f"Book_{slot}"]
        author = row[f"Author_{slot}"]

        # Skip the slot only when both the title and the author are missing.
        if pd.isna(book) and pd.isna(author):
            continue

        content_list.append(
            content(title=parse_text(book), creator=parse_text(author))
        )

    return content_list

def persist_objects(object_list):
    '''
    Saves every object in object_list, in order, by calling its save() method.
    '''
    for item in object_list:
        item.save()
        
def connect_content(parent_content_list, child_content_list):
    '''
    Establishes relationships between content.

    Every object in child_content_list is added to the related_content
    relation of every object in parent_content_list.

    parent_content_list : iterable of objects exposing related_content.add().
    child_content_list  : iterable of objects to attach to each parent.
    '''
    # Materialise once so a one-shot iterable is not exhausted after the
    # first parent.
    children = list(child_content_list)
    if not children:
        # Nothing to link; skip issuing empty add() calls.
        return

    for parent in parent_content_list:
        # add() accepts several objects, so one call per parent is enough.
        parent.related_content.add(*children)
            
def extract_topics_from_cell(cell):
    '''
    Returns list of topics extracted for this cell.

    cell : value of a "Topics" cell, i.e. a comma separated string or a
           missing value (NaN).

    Each comma separated entry is resolved to a topic object through
    parse_into_topic_object; a missing cell yields an empty list.
    '''
    # Read the argument that was passed in rather than a global `row`, so the
    # function works outside the import loop.
    list_of_topic_strings = convert_string_to_list(cell)

    return [
        parse_into_topic_object(topic_string)
        for topic_string in list_of_topic_strings
    ]

def parse_into_topic_object(topic_string):
    '''
    Returns existing or else newly created topic object.

    topic_string : title to look up.

    If no topic with this title exists, one is created and saved. If several
    topics share the title, the lookup's MultipleObjectsReturned propagates.
    '''
    # get_or_create does the lookup and the insert-on-miss in one ORM call;
    # the second tuple element (created flag) is not needed here.
    topic_object, _created = topic.objects.get_or_create(title=topic_string)
    return topic_object

def connect_topic(topic_list, content_list):
    '''
    Establishes relationships between topic and content.

    Every object in topic_list is added to the topics relation of every
    object in content_list.

    topic_list   : iterable of topic objects to attach.
    content_list : iterable of objects exposing topics.add().
    '''
    # Materialise once so a one-shot iterable is not exhausted after the
    # first content item.
    topics_to_add = list(topic_list)
    if not topics_to_add:
        # Nothing to attach; skip issuing empty add() calls.
        return

    # Local names deliberately avoid `topic` / `content`, which are the
    # imported model classes.
    for content_item in content_list:
        content_item.topics.add(*topics_to_add)

In [47]:
# Print, for the first 15 rows, the fields the import loop relies on:
# one value per line, with a blank line between rows.
for i, row in df_input.head(15).iterrows():
    print(
        row["Topics"],
        row["Book_1"],
        row["is_op"],
        row["submission_url"],
        sep="\n",
    )
    print()

nan
scythe
True
https://www.reddit.com/r/booksuggestions/comments/fnt620/scythe_by_neal_shusterman/

nan
nan
False
https://www.reddit.com/r/booksuggestions/comments/fnt620/scythe_by_neal_shusterman/

nan
The unwind
False
https://www.reddit.com/r/booksuggestions/comments/fnt620/scythe_by_neal_shusterman/

nan
nan
False
https://www.reddit.com/r/booksuggestions/comments/fnt620/scythe_by_neal_shusterman/

nan
the thunderd
False
https://www.reddit.com/r/booksuggestions/comments/fnt620/scythe_by_neal_shusterman/

nan
nan
False
https://www.reddit.com/r/booksuggestions/comments/fnt620/scythe_by_neal_shusterman/

nan
nan
False
https://www.reddit.com/r/booksuggestions/comments/fnt620/scythe_by_neal_shusterman/

nan
nan
False
https://www.reddit.com/r/booksuggestions/comments/fnt620/scythe_by_neal_shusterman/

nan
nan
False
https://www.reddit.com/r/booksuggestions/comments/fnt620/scythe_by_neal_shusterman/

nan
nan
False
https://www.reddit.com/r/booksuggestions/comments/fnt620/scythe_by_neal_shust

In [53]:
# Import every row: books become content objects, topics are attached, and
# comment rows are linked to the most recent OP row seen before them.

# Start with no parent, so a comment row that appears before any OP row is
# imported on its own instead of raising NameError or reusing state left
# over from an earlier run of this cell.
parent_topic_list = []
parent_content_list = []

for i, row in df_input.iterrows():
    # Steps shared by OP and comment rows: resolve topics, build and save
    # the row's books, then tag those books with the row's own topics.
    row_topic_list = extract_topics_from_cell(row["Topics"])
    row_content_list = extract_books_from_row(row)
    persist_objects(row_content_list)
    connect_topic(row_topic_list, row_content_list)

    # `== True` is kept on purpose: a missing (NaN) is_op is truthy, but it
    # must be handled as a comment row, not as an OP.
    if row["is_op"] == True:  # noqa: E712
        # parent (OP): following comment rows attach to this row.
        parent_topic_list = row_topic_list
        parent_content_list = row_content_list
    else:
        # child (Comments): link to the OP's books and inherit its topics.
        connect_content(parent_content_list, row_content_list)
        connect_topic(parent_topic_list, row_content_list)