In [1]:
from pathlib import Path

import pandas as pd

from cleaner_lib import add_word_count, concat_str_cols

In [2]:
# Input and output locations, all derived from the shared data directory.
data_p = Path("../data")
nd_processed_p = data_p.joinpath("processed_nextdoor_data")
# Source CSV read by this notebook.
nd_cleaned_p = nd_processed_p.joinpath("cleaned_nextdoor_data.csv")
# Destination CSV written by this notebook.
nd_out_p = nd_processed_p.joinpath("cleaned_nd_final_data.csv")

In [3]:
# Read the cleaned posts CSV into a DataFrame and preview the first rows.
nd_df = pd.read_csv(filepath_or_buffer=nd_cleaned_p)
nd_df.head(n=5)

Unnamed: 0,NextDoorID,ShortLink,Author,Location/Date,Address,MainPost,TopComments,Neighborhood,Date
0,--3jc5nsXN58,https://nextdoor.com/p/--3jc5nsXN58?view=detail,Hannah Lopez,Corridor/27 Jul,"4070-4080 Swift Avenue, San Diego, CA",how late can people be working on construction...,problem here is city code compliance inspector...,Corridor,07-27-2022
1,#NAME?,https://nextdoor.com/p/--mjpdwdS3yx?view=detail,Tim Welch,Montclair/8 Sep,,rain has finally arrived in north park but las...,it was just enough to rearrange the accumulate...,Montclair,09-08-2022
2,-3GwdKj4_sMm,https://nextdoor.com/p/-3GwdKj4_sMm?view=detail,News,17-Jul,,dont we have a water shortage...,jennifer that’s being addressed with a humongo...,,07-17-2022
3,-4qn3_2yNk_Y,https://nextdoor.com/p/-4qn3_2yNk_Y?view=detail,Frank Negrete,Hillcrest Northeast/20 Aug,,guess nd didnt like my question about drinking...,nd has been aggressive about deleting comments...,Hillcrest Northeast,08-20-2022
4,#NAME?,https://nextdoor.com/p/-5-J-BXgJ84y?view=detail,Dawn Burton,Hillcrest Southeast/Edited 3 May 21,,day time robbery marston hillsupdate. update u...,wtf this sounds like mad max worldexactly. so ...,Hillcrest Southeast,05-03-2021


In [4]:
# Build a string id per row: the prefix "nd" followed by the row's
# index label plus one (so the first row of a default index is "nd1").
nd_df["id"] = "nd" + pd.Series(nd_df.index + 1, index=nd_df.index).astype("str")

In [5]:
# Merge each main post with its top comments into one text column.
# Missing values are replaced with empty strings first so that both
# columns hold strings before they are handed to the helper.
nd_df["MainPost"] = nd_df["MainPost"].fillna("")
nd_df["TopComments"] = nd_df["TopComments"].fillna("")

# Helpers from cleaner_lib: the displayed output shows they add
# "post_text" and "post_text_count" columns (presumably joined text and
# its word count -- confirm against cleaner_lib).
nd_df = concat_str_cols(nd_df, "MainPost", "TopComments", "post_text")
nd_df = add_word_count(nd_df, "post_text")

# Keep only rows whose combined text has a positive count.
nd_df = nd_df[nd_df["post_text_count"] > 0]
nd_df.head()

Unnamed: 0,NextDoorID,ShortLink,Author,Location/Date,Address,MainPost,TopComments,Neighborhood,Date,id,post_text,post_text_count
0,--3jc5nsXN58,https://nextdoor.com/p/--3jc5nsXN58?view=detail,Hannah Lopez,Corridor/27 Jul,"4070-4080 Swift Avenue, San Diego, CA",how late can people be working on construction...,problem here is city code compliance inspector...,Corridor,07-27-2022,nd1,how late can people be working on construction...,131
1,#NAME?,https://nextdoor.com/p/--mjpdwdS3yx?view=detail,Tim Welch,Montclair/8 Sep,,rain has finally arrived in north park but las...,it was just enough to rearrange the accumulate...,Montclair,09-08-2022,nd2,rain has finally arrived in north park but las...,280
2,-3GwdKj4_sMm,https://nextdoor.com/p/-3GwdKj4_sMm?view=detail,News,17-Jul,,dont we have a water shortage...,jennifer that’s being addressed with a humongo...,,07-17-2022,nd3,dont we have a water shortage... jennifer that...,1250
3,-4qn3_2yNk_Y,https://nextdoor.com/p/-4qn3_2yNk_Y?view=detail,Frank Negrete,Hillcrest Northeast/20 Aug,,guess nd didnt like my question about drinking...,nd has been aggressive about deleting comments...,Hillcrest Northeast,08-20-2022,nd4,guess nd didnt like my question about drinking...,82
4,#NAME?,https://nextdoor.com/p/-5-J-BXgJ84y?view=detail,Dawn Burton,Hillcrest Southeast/Edited 3 May 21,,day time robbery marston hillsupdate. update u...,wtf this sounds like mad max worldexactly. so ...,Hillcrest Southeast,05-03-2021,nd5,day time robbery marston hillsupdate. update u...,1853


In [6]:
# Display the column labels currently on nd_df (sanity check before
# selecting the output columns).
nd_df.columns

Index(['NextDoorID', 'ShortLink', 'Author', 'Location/Date', 'Address',
       'MainPost', 'TopComments', 'Neighborhood', 'Date', 'id', 'post_text',
       'post_text_count'],
      dtype='object')

In [7]:
# Prepare the cleaned data and write it out: keep only the listed
# columns, in this order, then save without the index column.
nd_order = [
    "id",
    "ShortLink",
    "Author",
    "post_text",
    "post_text_count",
    "Neighborhood",
]
nd_df = nd_df.loc[:, nd_order]

nd_df.to_csv(nd_out_p, index=False)

nd_df.head()

Unnamed: 0,id,ShortLink,Author,post_text,post_text_count,Neighborhood
0,nd1,https://nextdoor.com/p/--3jc5nsXN58?view=detail,Hannah Lopez,how late can people be working on construction...,131,Corridor
1,nd2,https://nextdoor.com/p/--mjpdwdS3yx?view=detail,Tim Welch,rain has finally arrived in north park but las...,280,Montclair
2,nd3,https://nextdoor.com/p/-3GwdKj4_sMm?view=detail,News,dont we have a water shortage... jennifer that...,1250,
3,nd4,https://nextdoor.com/p/-4qn3_2yNk_Y?view=detail,Frank Negrete,guess nd didnt like my question about drinking...,82,Hillcrest Northeast
4,nd5,https://nextdoor.com/p/-5-J-BXgJ84y?view=detail,Dawn Burton,day time robbery marston hillsupdate. update u...,1853,Hillcrest Southeast


In [8]:
# Show the rows ordered from smallest to largest post_text_count.
# The sorted frame is only displayed; nd_df itself is not reassigned.
nd_df.sort_values("post_text_count", ascending=True)

Unnamed: 0,id,ShortLink,Author,post_text,post_text_count,Neighborhood
1559,nd1560,https://nextdoor.com/p/YgNXnJ7_TS5Z?view=detail,Laura Galbraith,rabies............click on pic,3,North Park-Morley Field
1753,nd1754,https://nextdoor.com/p/btdFw254nXsS?view=detail,Dan Bowles,homicide in hillcrest.,3,Georgia St. Bridge
1189,nd1190,https://nextdoor.com/p/QgPG5nWgQgr9?view=detail,News,thank you cal fire.,4,
1895,nd1896,https://nextdoor.com/p/fXnCKN-tW2mF?view=detail,The San Diego Union-Tribune,oceanside police want communitys input.,5,
2635,nd2636,https://nextdoor.com/p/wkShjKpMsHwz?view=detail,The San Diego Union-Tribune,one day with an el cajon cop.,7,
...,...,...,...,...,...,...
2099,nd2100,https://nextdoor.com/p/jq7RmBWdsxgX?view=detail,Eleanor Jacobs,“urban sprawl” that’s the term i heard on npr ...,2702,North Park Burlingame/Altadena
891,nd892,https://nextdoor.com/p/Jy9GrgmqwCLs?view=detail,Andy Hochman,very recently i went by an old residence where...,2714,University Heights Antique Row N
1167,nd1168,https://nextdoor.com/p/QLNgdMFmk3pp?view=detail,Jonathan W.,another attack by a homeless criminal averted ...,2902,North Linda Vista
1617,nd1618,https://nextdoor.com/p/Zj3rZ2LbDc32?view=detail,Andy Hochman,so the seven story monolith that is going up o...,2953,University Heights Antique Row N
