In [89]:
import pandas as pd
import numpy as np 
from datetime import datetime
import mailbox
import regex as re 
import os 
import boto3 
import gzip
from dotenv import load_dotenv, find_dotenv
import wget

In [51]:
# Pull connection settings into the process environment from the nearest .env file.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

# Every setting below is mandatory: a missing variable raises KeyError right here.
s3_secret_key = os.environ["AWS_SECRET_ACCESS_KEY"]
s3_bucket_name = os.environ["JUPYTERHUB_USER"]  # passed as Bucket= in the upload cell
s3_endpoint_url = os.environ["S3_ENDPOINT_URL"]
s3_access_key = os.environ["AWS_ACCESS_KEY_ID"]
s3bucket = os.environ["BUCKET"]  # NOTE(review): not referenced by any visible cell

# Echo the non-secret settings so the target endpoint/bucket is visible in the output.
print(s3_endpoint_url, s3_bucket_name, sep="\n")

# S3 client bound to the custom endpoint; used later for upload_file.
s3 = boto3.client(
    service_name="s3",
    region_name="us-east-1",
    endpoint_url=s3_endpoint_url,
    aws_access_key_id=s3_access_key,
    aws_secret_access_key=s3_secret_key,
)

https://s3.upshift.redhat.com
cdolfi


The cell that builds "msgs" is where the errors occur: they are raised while trying to download and decompress the mbox archive.

In [52]:
def gunzip(source_filepath, dest_filepath, block_size=65536):
    """Decompress a gzip file into ``dest_filepath``, one chunk at a time.

    Args:
        source_filepath: Path of the gzip-compressed input file.
        dest_filepath: Path of the output file; it is created or overwritten.
        block_size: Number of decompressed bytes requested per read.

    Returns:
        None. The decompressed bytes are written to ``dest_filepath``.
    """
    with gzip.open(source_filepath, 'rb') as compressed:
        with open(dest_filepath, 'wb') as plain:
            # The b'' sentinel ends the loop at the first empty read (end of stream).
            for chunk in iter(lambda: compressed.read(block_size), b''):
                plain.write(chunk)

In [53]:
# How to get the text from a message in the mbox.
def get_text(msg):
    """Return the payload of the first leaf part of ``msg``.

    Multipart containers are descended by always taking their first part
    until a non-multipart part is reached; that part's payload is returned
    as stored (no transfer-decoding is applied).

    Args:
        msg: A message object exposing ``is_multipart()`` and ``get_payload()``.

    Returns:
        The payload of the first leaf part, or ``""`` when a multipart
        container has no parts or the leaf carries no payload at all.
    """
    while msg.is_multipart():
        parts = msg.get_payload()
        if not parts:
            # A multipart container without parts has no text to offer;
            # indexing [0] here used to raise IndexError.
            return ""
        msg = parts[0]
    payload = msg.get_payload()
    # A message with no payload set yields None; normalise it so callers
    # can treat the result as text.
    return "" if payload is None else payload

In [54]:
# Strip thread text and dates in body text.

# A line is dropped when it starts with any of these markers
# (quote characters, echoed headers, PGP armor fields, attribution openers).
_THREAD_LINE_PREFIXES = (
    ">", "|",
    "Re:", "Subject", "From:", "Date:",
    "Hash:", "Version: G",
    "Am Mit,", "Am Don,", "Am Mon,", "Quoting", "Em Quinta,",
)

# A line is dropped when it contains any of these fragments anywhere.
_THREAD_LINE_FRAGMENTS = (
    "BEGIN PGP SIGNED MESSAGE",
    "wrote:", "wrote :", "writes:", "said:",
)

# A line is dropped when it ends with any of these fragments.
_THREAD_LINE_SUFFIXES = ("said: ", "babbled:", "wrot=e:", "A9crit :")

# Attribution lines carrying a date; each pattern is matched from the line start.
# Raw strings keep the regex text identical while avoiding the invalid "\d"
# escape that a plain string literal would contain.
_THREAD_DATE_PATTERNS = (
    r".*n (Sun|Mon|Tue|Wed|Thu|Fri|Sat), .. (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec) 20..*",
    r".*n (Sunday|Monday|Tuesday|Wednesday|Thursday|Friday|Saturday) .. (January|February|March|April|May|June|July|August|September|October|November|December) 20..*",
    r".*n (Sun|Mon|Tue|Wed|Thu|Fri|Sat), (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Sept|Oct|Nov|Dec) .., 20..*",
    r".*n (Sun|Mon|Tue|Wed|Thu|Fri|Sat), 20[\d]{2}-[\d]{2}-[\d]{2} at.*",
)


def _is_thread_noise(line):
    """Return True when ``line`` is empty or matches any thread/quote marker."""
    if not line:
        return True
    if line.startswith(_THREAD_LINE_PREFIXES):
        return True
    if line.endswith(_THREAD_LINE_SUFFIXES):
        return True
    if any(fragment in line for fragment in _THREAD_LINE_FRAGMENTS):
        return True
    return any(re.match(pattern, line) is not None for pattern in _THREAD_DATE_PATTERNS)


def strip_thread(text):
    """Remove quoted-thread lines, echoed headers and attribution lines.

    Carriage returns are removed, the text is split on newlines, and every
    empty line or line matching one of the marker tables above is discarded.
    All checks are per-line and independent, so they run in a single pass.

    Args:
        text: Message body as a string.

    Returns:
        The surviving lines joined with ``"\\n"`` (an empty string when
        nothing survives).
    """
    lines = text.replace("\r", "").split("\n")
    return "\n".join(line for line in lines if not _is_thread_noise(line))

In [55]:
# Build a 2D list of email data with thread text and dates removed.
def build_list(mbox):
    """Turn an iterable of messages into rows of cleaned email data.

    Args:
        mbox: Iterable of message objects supporting header lookup by name
            (``msg["Date"]`` etc.) and accepted by ``get_text``.

    Returns:
        A list with one entry per message, each of the form
        ``[clean_body, date, from, subject, message_id, in_reply_to]``.
        ``date`` is the "Date" header with its last nine characters removed,
        or ``None`` when the message has no "Date" header.
    """
    chart = []
    for msg in mbox:
        raw_date = msg["Date"]
        # Slicing a missing header (None) used to raise TypeError and abort the
        # whole build; keep the row and leave the date empty instead.
        # NOTE(review): the [:-9] trim presumably drops ":SS +ZZZZ" so the value
        # fits the '%a, %d %b %Y %H:%M' format parsed in pandas_clean - confirm.
        trimmed_date = raw_date[:-9] if raw_date is not None else None
        chart.append([
            strip_thread(get_text(msg)),
            trimmed_date,
            msg["From"],
            msg["Subject"],
            msg["Message-ID"],
            msg["In-Reply-To"],
        ])
    return chart
    

In [56]:
# Format for CSV, clean special characters, and remove extraneous emails.
def pandas_clean(chart):
    """Build a cleaned, chronologically sorted DataFrame from email rows.

    Args:
        chart: List of rows ``[body, date, from, subject, message_id, in_reply]``
            where ``date`` is a string such as ``"Sat, 31 Jan 2004 18:27"``.

    Returns:
        A DataFrame with columns Body, Date, From, Subject, Message ID,
        In-Reply and datetime, sorted by ``datetime`` with a fresh 0..n-1 index.
        Rows whose cleaned body is missing, empty, a single space, ``"+1"``,
        or starts with "Missing expected images" / "OLD: Fedora" are removed.

    Raises:
        ValueError: from ``pd.to_datetime`` when a Date value does not match
            the ``'%a, %d %b %Y %H:%M'`` format.
    """
    emails = pd.DataFrame(chart, columns=["Body", 'Date', "From", "Subject", "Message ID", "In-Reply"])

    # Regex passes applied to the body, in order: (patterns, replacement).
    # Raw strings reproduce the original pattern text without invalid escapes.
    regex_passes = [
        # newlines become spaces
        ([r"\n", "\n"], " "),
        # quotes, angle brackets, "= ", hyphens and URLs are removed
        ([r"\'", "'", ">", "<", "= ", "-", r"http\S+"], ""),
        # backslash + whitespace, backslash + run of "s", and stray "="
        ([r"\\\s+", r"\\s+", "="], ""),
        # collapse runs of spaces
        (["   ", "  "], " "),
        # underscores and leftover "3D" tokens are removed
        (["_", "3D"], ""),
        # collapse the spaces opened up by the removals above
        (["   ", "  "], " "),
        (["   ", "  "], " "),
    ]
    body = emails['Body']
    for patterns, replacement in regex_passes:
        body = body.replace(to_replace=patterns, value=replacement, regex=True)
    # Assign the result back: calling replace(inplace=True) on emails['Body']
    # is chained assignment and does not update the frame under Copy-on-Write.
    emails['Body'] = body

    # Drop empty / content-free bodies and automated report mails.
    keep = body.notna() & ~body.isin(['', ' ', '+1'])
    keep &= ~body.str.startswith('Missing expected images', na=False)
    keep &= ~body.str.startswith('OLD: Fedora', na=False)
    emails = emails.loc[keep].copy()

    emails['datetime'] = pd.to_datetime(emails['Date'], format='%a, %d %b %Y %H:%M')
    return emails.sort_values(by='datetime').reset_index(drop=True)
    

In [88]:
#EDIT: pick the mailing list ('devl' or 'user') and the export window.
f_list = 'user'
start_year = '2006'
start_month = '06'
finish_year = '2006'
finish_month = '09'

# Base export URL for the selected list.
if f_list == 'devl':
    s = "https://lists.fedoraproject.org/archives/list/devel@lists.fedoraproject.org/export/devel@lists.fedoraproject.org.mbox.gz?start="
elif f_list == 'user':
    s = "https://lists.fedoraproject.org/archives/list/users@lists.fedoraproject.org/export/users@lists.fedoraproject.org.mbox.gz?start="
else:
    # Fail with a clear message instead of a NameError on `s` further down.
    raise ValueError(f"Unknown f_list {f_list!r}; expected 'devl' or 'user'")

# The window runs from the first day of the start month to the first day of the finish month.
s = f"{s}{start_year}-{start_month}-01&end={finish_year}-{finish_month}-01"
print(s)

# Use the path wget actually wrote. If an archive left over from an earlier
# (possibly failed) run already exists, wget saves the new download under a
# suffixed name, so a hard-coded path would decompress the stale file.
name = wget.download(s, out="../data/raw")

try:
    gunzip(name, '../data/interim/pulled.mbox')
finally:
    # Remove the archive even when decompression fails, so no broken file
    # lingers for the next run.
    os.remove(name)

msgs = mailbox.mbox('../data/interim/pulled.mbox')
     

https://lists.fedoraproject.org/archives/list/users@lists.fedoraproject.org/export/users@lists.fedoraproject.org.mbox.gz?start=2006-06-01&end=2006-09-01


EOFError: Compressed file ended before the end-of-stream marker was reached

In [85]:
# Run the cleaning pipeline on the downloaded mbox: extract rows, then clean them into a DataFrame.

mbox_info = build_list(msgs)
data = pandas_clean(mbox_info)

# The cleaned frame is saved to the bucket in the next cell, with a name derived from the list and date range that were retrieved.
# Earlier local export, kept for reference:
#updated_email.to_csv('/opt/app-root/src/data/user_clean2.csv')

In [86]:
# Write the cleaned frame to a local CSV, then upload it under
# "<list>/<start year>_<start month>-<finish year>_<finish month>.csv".
data.to_csv('../data/interim/temp_clean.csv')
s3_location = f"{f_list}/{start_year}_{start_month}-{finish_year}_{finish_month}.csv"
s3.upload_file(
    Filename='../data/interim/temp_clean.csv',
    Bucket=s3_bucket_name,
    Key=s3_location,
)

In [60]:
# Display the cleaned DataFrame.
data

Unnamed: 0,Body,Date,From,Subject,Message ID,In-Reply,datetime
0,"This is roughly 45 to 75 C. 45C is fine, 75 is...","Sat, 31 Jan 2004 18:27",Stephen Walton <stephen.walton at csun.edu>,Re: CPU Temp under lm sensors,<1075602464.15697.3.camel@dhcppc1>,1075595900.6869.0.camel@insomnia.lmig.com,2004-01-31 18:27:00
1,Whats a safe temp for a CPU to run at? My insp...,"Sat, 31 Jan 2004 19:38",Troy Campano <troycampano at yahoo.com>,Re: CPU Temp under lm sensors,<1075595900.6869.0.camel@insomnia.lmig.com>,401B3766.2090506@csun.edu,2004-01-31 19:38:00
2,Could be your sensors.conf file needs some twe...,"Sat, 31 Jan 2004 20:50",Randy Kelsoe <randykel at swbell.net>,Re: CPU Temp under lm sensors {Scanned},<401C696F.20003@swbell.net>,1075602464.15697.3.camel@dhcppc1,2004-01-31 20:50:00
3,Anyone else noticed that this rsync mirror isn...,"Sat, 31 Jan 2004 21:17",Mike Chambers <mike at netlyncs.com>,dulug.duke.edu mirror,<1075605454.3081.13.camel@bart.netlyncs.com>,,2004-01-31 21:17:00
4,Did you do a yumarch on the directories in you...,"Sat, 31 Jan 2004 21:47",Randy Kelsoe <randykel at swbell.net>,Re: Yum and xmms problem {Scanned},<401C76D0.7010505@swbell.net>,401C6FD5.6040903@margo.bijoux.nom.br,2004-01-31 21:47:00
...,...,...,...,...,...,...,...
84298,"Hi, I want to know if a project like this exis...","Mon, 31 Jan 2005 23:22",bsebastien at bluewin.ch,System Summary for Webpage,<41FEAFA7.808@bluewin.ch>,,2005-01-31 23:22:00
84299,"Hi, I want to know if a project like this exi...","Mon, 31 Jan 2005 23:52",bsebastien at bluewin.ch,Re: System Summary for Webpage,<41FEB6A7.6010408@bluewin.ch>,,2005-01-31 23:52:00
84300,i have changed the file to enable Per user Web...,"Tue, 01 Feb 2005 00:51",Prudhvi Krishna Surapaneni <prudhvikrishna at ...,FC3 :Apache Problem,<ac3b0f10501311121549f3847@mail.gmail.com>,,2005-02-01 00:51:00
84301,p: Are you using nscd? Try wiping out its cach...,"Tue, 01 Feb 2005 09:00",Dan <ml at mutox.org>,Re: Problems with ldap on Fedora Core 3,<1107212431.10494.58.camel@devel2.x32.com.au>,41FE8CEF.7020900@virtc.com,2005-02-01 09:00:00


In [31]:
# Spot-check the 'Date' string of the row at position 400.
data.iloc[400]['Date']

'Mon, 16 Oct 2017 09:30'

In [48]:
# Manual cleanup of the downloaded archive. The download cell already removes
# it, so only delete when the file is still on disk (avoids FileNotFoundError
# when the notebook is run top to bottom).
if os.path.exists(name):
    os.remove(name)

In [64]:
# Display the cleaned DataFrame again.
# NOTE(review): the recorded output differs from the earlier display, so `data` presumably came from a different run - confirm.
data

Unnamed: 0,Body,Date,From,Subject,Message ID,In-Reply,datetime
0,Redistribution of flashplugin is completely il...,"Sat, 31 Jan 2004 17:00",Warren Togami <warren at togami.com>,Re: mplayer vs. xine,<401C6BC1.8040600@togami.com>,20040131152547.GC5507@thyrsus.com,2004-01-31 17:00:00
1,I am guessing what people are looking for is s...,"Sat, 31 Jan 2004 19:15",Stephen Smoogen <smoogen at lanl.gov>,Re: Upgrade of unmaintained packages,<Pine.LNX.4.58.0401311913570.32075@rhel3dev.ds...,1075478191.25735.14.camel@chip.laiskiainen.org,2004-01-31 19:15:00
2,"Wow. Sweet. As an FYI, 1.5.3 not only fixes t...","Sat, 31 Jan 2004 20:19",Paul Iadonisi <pri.rhl1 at iadonisi.to>,Evolution 1.5.3 (was: Re: Evolution 1.5.2),<1075598343.2711.5.camel@va.local.linuxlobbyis...,1074557012.14797.11.camel@va.local.linuxlobbyi...,2004-01-31 20:19:00
3,I just did the upgrade and found that there wa...,"Sat, 31 Jan 2004 21:46",Jim Cornette <cornette at insight.rr.com>,"Menus back, but where is...",<401C6894.4020601@insight.rr.com>,,2004-01-31 21:46:00
4,This is now correct behaviour. Nautilus is no ...,"Sat, 31 Jan 2004 21:54",Gerald Henriksen <ghenriks at rogers.com>,Re: Nautilus toolbars,<tgqo10t2n2840lpc3bm9f0sld73303trrn@4ax.com>,1075592741.3699.3.camel@aurora.localdomain,2004-01-31 21:54:00
...,...,...,...,...,...,...,...
15872,"The ""problem"" I have about changelog is its du...","Tue, 01 Feb 2005 00:02",=?utf-8?q?F=C3=A9liciano_Matias_=3Cfeliciano=2...,Re: radical suggestion for fc4 release,<1107212547.17531.10.camel@one.myworld>,1107209076.5291.26.camel@opus.phy.duke.edu,2005-02-01 00:02:00
15873,Hopefully not :). Joe is about the only useful...,"Tue, 01 Feb 2005 00:22",Pekka Savola <pekkas at netcore.fi>,Re: Volunteers? was Re: further package remova...,<Pine.LNX.4.61.0502010020130.26769@netcore.fi>,Pine.LNX.4.58.0501261047070.29773@devserv.deve...,2005-02-01 00:22:00
15874,"first. As they are stored compressed, it is m...","Tue, 01 Feb 2005 00:28",Enrico Scholz <enrico.scholz at informatik.tu-...,Re: radical suggestion for fc4 release,<87mzupch6k.fsf@kosh.ultra.csn.tu-chemnitz.de>,1107212145.30653.174.camel@shahms.mesd.k12.or.us,2005-02-01 00:28:00
15875,e first. For sake of completeness: $ rpm qa ch...,"Tue, 01 Feb 2005 00:59",Ziga Mahkovec <ziga.mahkovec at klika.si>,Re: radical suggestion for fc4 release,<1107215993.14233.5.camel@serenity.klika.si>,1107212145.30653.174.camel@shahms.mesd.k12.or.us,2005-02-01 00:59:00


In [82]:
# Debugging cell: re-run only the decompression step on the downloaded archive.
# The recorded run ended in EOFError.
gunzip(name,'../data/interim/pulled.mbox' )

EOFError: Compressed file ended before the end-of-stream marker was reached