Skip to content
Browse files

First commit

  • Loading branch information...
0 parents commit 105ebac909d9cab8d5c0d7095c40b3ed6da612a2 @alexksikes committed
Showing with 894 additions and 0 deletions.
  1. +4 −0 .gitignore
  2. +33 −0 README
  3. +16 −0 autocraig.conf
  4. 0 autocraig.duplicates
  5. +246 −0 autocraig.py
  6. +413 −0 html2text.py
  7. +142 −0 send_mail.py
  8. +40 −0 text_similarity.py
4 .gitignore
@@ -0,0 +1,4 @@
+*.pyc
+*.so
+*~
+_autocraig.conf
33 README
@@ -0,0 +1,33 @@
+This program scrapes a specified section of Craigslist and automatically emails all the authors of each post with a custom email message.
+
+I wrote this program back in the summer of 2007 in order to look for a place in Newport Beach. The idea was to have people email me instead of the other way around and to make sure I would most likely be the first person on the deal.
+
+Example:
+
+$url:
+http://orangecounty.craigslist.org/search/hhh?query=superior+newport+beach&minAsk=min&maxAsk=1350
+
+$msg:
+Hi,
+
+I just looked at your post on Craigslist and I'm very interested in the room. Would it be possible to have a look at it ASAP?
+
+Thank you so much,
+
+Alex Ksikes
+
+and run it as:
+python autocraig.py --report --auto $msg $url
+
+This will email all the authors of each post scraped at the specified URL (--auto option). Here I'm looking for a room on Superior in Newport Beach with a maximum rent of $1350. The result of the scraping is also sent to me in one email (--report option) for further scrutiny.
+
+Email parameters are set in autocraig.conf. Duplicates are kept in autocraig.duplicates to make sure each author is emailed only once. The program can scrape deep to a chosen number of days or pages. Best used as a cron job. Use it with caution to avoid SPAM! (consider removing the --auto option)
+
+I set it up on the following URLs as a cron job (and sporadically used the --auto option):
+
+http://orangecounty.craigslist.org/search/hhh?query=superior+newport+beach&minAsk=min&maxAsk=1350
+http://orangecounty.craigslist.org/search/hhh?query=hoag+newport+beach&minAsk=min&maxAsk=1350
+http://orangecounty.craigslist.org/search/hhh?query=ticonderoga&minAsk=min&maxAsk=1350
+http://orangecounty.craigslist.org/search/hhh?query=newport+crest&minAsk=min&maxAsk=1350
+
+The irony of the story: I ended up renting the place across the street!!
16 autocraig.conf
@@ -0,0 +1,16 @@
+### email options
+FROM_EMAIL = me@example.com
+REPLY_EMAIL = reply@example.com
+CC_EMAIL = cc@example.com
+
+### number of days deep
+NUM_DAYS = 3
+NUM_PAGES = 3
+
+### duplicates file path
+DUPLICATES = autocraig.duplicates
+VALID = 15
+SIMILARITY = 0.9
+
+### report
+TO_EMAIL = me@example.com
0 autocraig.duplicates
No changes.
246 autocraig.py
@@ -0,0 +1,246 @@
+# Author : Alex Ksikes
+
+# requires
+# - send_mail.py
+# - html2text.py
+
+# TODO:
+# - faster page retrieval (concurrent dnl)
+# - test auto email
+# - test duplicates by bag of words
+# - it'd be nicer to specify some set of keywords (no urls)
+# then create one big page for all the queries
+
+import re, urllib
+
def autocraig(search_url, auto=False, report=False,
              ignore_duplicates=False, quiet=False,
              duplicates_file=None):
    """Scrape the craigslist search results at *search_url* and act on them.

    auto             -- message text; when truthy, it is emailed to the reply
                        address of every (non-duplicate) scraped post.
    report           -- when True, an HTML digest of the posts is emailed to
                        the configured TO_EMAIL address.
    ignore_duplicates-- skip loading the duplicates file (all posts are new).
    quiet            -- suppress printing the text report to stdout.
    duplicates_file  -- override the duplicates file path from the config.

    Side effects: reads ./autocraig.conf, performs HTTP requests, appends to
    the duplicates file, and may send email.
    """
    # read config file (mutates the module-level `conf` dict)
    read_conf('autocraig.conf')
    # set the duplicates file
    if not duplicates_file:
        duplicates_file = conf['DUPLICATES']
    # load previously seen posts so authors are emailed only once
    duplicates = {}
    if not ignore_duplicates:
        duplicates = load_duplicates(duplicates_file)
    # fetch and parse every post linked from the search page
    posts = get_all_posts(search_url, duplicates)
    # remember the new posts for future runs
    add_to_duplicates(duplicates_file, posts)
    # email an html report if needed
    if report:
        email_digest(posts)
    # auto email all authors with the given message
    if auto:
        email_authors(posts, auto)
    # output result to stdout (trailing comma: no extra newline)
    if not quiet:
        print rep(posts),
+
# Default configuration; read_conf() overrides these from autocraig.conf.
conf = {
    'FROM_EMAIL' : '',
    'REPLY_EMAIL' : '',
    'CC_EMAIL' : '',
    'NUM_DAYS' : 3,
    'NUM_PAGES' : 3,
    'TO_EMAIL' : '',
    'DEEP' : 1,
    'DUPLICATES' : 'autocraig.duplicates',
    'VALID' : 15,
    'SIMILARITY' : 0.9}

def read_conf(conf_file):
    """Merge KEY = VALUE pairs from *conf_file* into the global `conf` dict.

    Blank lines and lines starting with '#' are ignored.  Values are coerced
    to int or float when possible (fix: the original stored every value as a
    string, which broke numeric comparisons such as the SIMILARITY
    threshold).  Splitting uses the first '=' only, so values may themselves
    contain '=' (fix: the original's split('=') raised on such lines).
    """
    global conf
    with open(conf_file) as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            key, sep, value = line.partition('=')
            if not sep:
                continue  # not a KEY = VALUE line
            value = value.strip()
            for cast in (int, float):
                try:
                    value = cast(value)
                    break
                except ValueError:
                    pass
            conf[key.strip().upper()] = value
+
# Record separator used in the duplicates file: id SEP text SEP id SEP text ...
d_sep = '@#@#@'

def load_duplicates(duplicate_file):
    """Read the duplicates file and return {craig_id: description_text}.

    The file is a flat alternation of ids and descriptions separated by
    d_sep, as written by add_to_duplicates().  fix: the file handle is now
    closed deterministically via `with` (the original leaked it).
    """
    duplicates = {}
    with open(duplicate_file) as f:
        data = f.read().split(d_sep)
    # even slots are ids, odd slots are descriptions; a trailing empty
    # element (file ends with d_sep) is dropped by zip's shorter argument
    for (craig_id, text) in zip(data[0::2], data[1::2]):
        duplicates[craig_id] = text
    return duplicates
+
import urlparse

def get_all_posts(search_url, duplicates, deep=conf['DEEP']):
    """Fetch every post linked from *search_url*, skipping known duplicates.

    NOTE(review): the *deep* default is bound to conf['DEEP'] at import time,
    so values loaded later by read_conf() do not affect it; it is currently
    unused because the paging loop below is commented out.
    """
    posts = []
    #for i in range(deep):
    #    search_url += '&s=%s' % i * 100
    for u in get_post_urls(search_url):
        # post links on the results page may be relative to the search URL
        post = get_post(urlparse.urljoin(search_url, u))
        if not duplicates or not is_duplicates(duplicates, post):
            posts.append(post)
    return posts
+
#p_urls = re.compile('<p>&nbsp;.*?&nbsp;&nbsp;&nbsp;<a href="(.*?)">.*?</a>', re.I)
# One result row of the craigslist search page: captures the post's href.
p_urls = re.compile('<p>.*?\-.*?<a href="(.*?)">.*?</a>', re.I)
def get_post_urls(search_url):
    """Download one search results page and return the post URLs found on it
    (possibly relative; the caller resolves them)."""
    html = urllib.urlopen(search_url).read()
    return p_urls.findall(html)
+
import html2text
# Per-field extraction patterns applied to a single post page.
p_post = {'reply' : re.compile('mailto:(.*?)\?', re.I),
          'description_html' : re.compile('(<h2>.*?postingid:\s[0-9]+)<br>', re.I|re.S)}
def get_post(post_url):
    """Download one post page and return a dict describing it.

    Keys: url, craig_id (numeric id from the URL), reply (mailto address),
    description_html, description_text (markdown-ish text), phone,
    email_alternative.  Missing fields default to ''.
    """
    post, html = {}, urllib.urlopen(post_url).read()
    post['url'] = post_url
    # craigslist post URLs end in /<numeric id>.html
    post['craig_id'] = re.findall('/([0-9]+)\.html', post_url)[0]
    for type, p in p_post.items():
        txt = p.findall(html)
        if txt:
            txt = txt[0]
        else:
            txt = ''
        post[type] = txt
    try:
        post['description_text'] = html2text.html2text(post['description_html']).encode('utf-8')
    # NOTE(review): bare except hides any html2text failure; the text
    # description silently falls back to ''
    except:
        post['description_text'] = ''
    # pull a phone number and an alternative email out of the text body
    post['phone'], post['email_alternative'] = analyze(post['description_text'])
    return post
+
def is_duplicates(duplicates, post):
    """Return True if *post* matches a previously seen post.

    A post is a duplicate when its craigslist id is already known, or when
    its description's bag-of-words overlap with any stored description is at
    least the SIMILARITY threshold.

    fix: config values read from file may be strings, so the threshold is
    coerced with float() before comparing (a string/float comparison silently
    misbehaved); dict.has_key() replaced with the `in` operator.
    """
    if post['craig_id'] in duplicates:
        return True
    for text in duplicates.values():
        if dot(text, post['description_text']) >= float(conf['SIMILARITY']):
            return True
    return False
+
def get_bag(s):
    """Return a bag-of-words dict mapping each whitespace-separated token of
    *s* to its occurrence count."""
    v = {}
    for w in s.split():
        v[w] = v.get(w, 0) + 1
    return v

def dot(s1, s2):
    """Normalized bag-of-words overlap between two strings.

    The inner product of the two word-count vectors is divided by the length
    (in tokens) of the longer string; two empty strings score 0.0.

    fix: dict.has_key() (removed in Python 3) replaced with `in`; the
    redundant `score = 0` reset in the empty-input branch dropped (the
    overlap is already 0 when both strings are empty).
    """
    v1, v2 = get_bag(s1), get_bag(s2)
    score = sum(val * v2[w] for w, val in v1.items() if w in v2)
    norm = max(len(s1.split()), len(s2.split()))
    if norm == 0:
        norm = 1
    return 1.0 * score / norm
+
def add_to_duplicates(duplicates_file, posts):
    """Append each post's craigslist id and description text to the
    duplicates file, using d_sep as the record separator."""
    with open(duplicates_file, 'a') as out:
        for post in posts:
            record = post['craig_id'] + d_sep + post['description_text'] + d_sep
            out.write(record)
+
# Phone pattern adapted from Dive Into Python.
# fix: the pattern is written with whitespace and inline comments, so it MUST
# be compiled with re.VERBOSE — the original compiled it with re.I only,
# making the comments and newlines part of the pattern.
phonePattern = re.compile(r'''
    # don't match beginning of string, number can start anywhere
    (\d{3})     # area code is 3 digits (e.g. '800')
    \D*         # optional separator is any number of non-digits
    (\d{3})     # trunk is 3 digits (e.g. '555')
    \D*         # optional separator
    (\d{4})     # rest of number is 4 digits (e.g. '1212')
    \D*         # optional separator
    (\d*)       # extension is optional and can be any number of digits
    $           # end of string
    ''', re.VERBOSE)
# Email pattern from the ASPN cookbook (same re.VERBOSE fix applies).
mailPattern = re.compile(r'''
    [\w\-][\w\-\.]*@[\w\-][\w\-\.]+[a-zA-Z]{1,4}
    ''', re.IGNORECASE | re.VERBOSE)

def analyze(description_text):
    """Extract (phone, email) from *description_text*; '' when absent.

    fix: the original used findall() on a grouped pattern, so it returned a
    tuple of groups instead of a string and rep() crashed concatenating it.
    The groups are now joined with '-' (empty extension dropped).
    """
    phone = ''
    m = phonePattern.search(description_text)
    if m:
        phone = '-'.join(part for part in m.groups() if part)
    emails = mailPattern.findall(description_text)
    email = emails[0] if emails else ''
    return (phone, email)
+
from send_mail import send_mail
import datetime

def email_digest(posts):
    """Email one HTML digest of all scraped *posts* to TO_EMAIL.

    Does nothing when there are no posts.  The subject embeds the current
    timestamp so successive digests are distinguishable.
    NOTE(review): to_addrs is passed as a single string although
    Mailer.compose indexes to_addrs[0] — confirm against send_mail.
    """
    if posts:
        send_mail(to_addrs=conf['TO_EMAIL'], message=rep(posts, html=True), from_addr=conf['FROM_EMAIL'],
                  content_type='text/html', subject='craigslist-auto-%s' % datetime.datetime.now())
+
def email_authors(posts, msg):
    """Send *msg* to the reply address of every post.

    fixes:
    - the original passed from_addrs= but send_mail's parameter is
      from_addr, so every call raised TypeError;
    - Mailer.compose indexes to_addrs[0]/cc_addrs[0], so bare strings must
      be wrapped in lists (otherwise only the first character was used);
    - get_post never sets a 'title' key, so post['title'] raised KeyError —
      a generic subject is used as a fallback.
    """
    for post in posts:
        if not post.get('reply'):
            continue  # no reply address scraped; nothing to send to
        send_mail(to_addrs=[post['reply']], from_addr=conf['FROM_EMAIL'],
                  cc_addrs=[conf['CC_EMAIL']], message=msg,
                  subject=post.get('title', 'Re: your craigslist post'))
+
def rep(posts, html=False):
    """Render *posts* as a single report string.

    html=False gives a plain-text report ('#' rulers, text descriptions);
    html=True gives an HTML report (<hr> rulers, source links, HTML
    descriptions).  The trailing newline of the last entry is stripped.
    """
    chunks = []
    for post in posts:
        if html:
            header = '<a href="%s">source</a>' % post['url']
            divider = '<hr>\n'
            body_key = 'description_html'
        else:
            header = 'source : ' + post['url']
            divider = 50 * '#' + '\n'
            body_key = 'description_text'
        header += post['phone'] + post['email_alternative']
        chunks.append(divider + header + post[body_key] + '\n')
    return ''.join(chunks)[:-1]
+
+def usage():
+ print "Usage: python autocraig.py [options] search_url"
+ print
+ print "Description:"
+ print "Scrape craigslist posts for a section specified by the search url."
+ print "Print to stdout all the posts that have been scraped."
+ print "Auto email the author's of each post."
+ print "All config options are specified in cwd/autocraig.conf."
+ print "Lists of duplicates are kept in cwd/autocraig.duplicates."
+ print
+ print "Options:"
+ print "--auto msg_file: email all authors wiht msg_file or use - to read from stdin"
+ print "--report : send digest html email with pictures and summary"
+ print "--ignore-duplicates : ignore the duplicate detection facility"
+ print "--duplicate-file file : use another duplicate file"
+ print "--quiet : do not show the emailed posts and summary"
+ print "--help : this help message"
+ print
+ print "Email bugs/suggestions to Alex Ksikes (alex.ksikes@gmail.com)"
+
import sys, getopt

def main():
    """Parse command line options and run autocraig on the last argument
    (the search URL)."""
    try:
        # fix: "--auto" takes an argument (the message file), so it must be
        # declared as "auto=" — without the '=' getopt never captured the
        # file name; also register "h" so the -h short option works.
        opts, args = getopt.getopt(sys.argv[1:], "h",
            ["auto=", "report", "ignore-duplicates", "quiet",
             "duplicate-file=", "help"])
    except getopt.GetoptError:
        usage()
        sys.exit(2)
    auto = report = ignore_duplicates = quiet = False
    duplicate_file = None
    for o, a in opts:
        if o == "--auto":
            # fix: the message is the *contents* of the file (or stdin for
            # "-"); the original passed the file name itself as the message.
            if a == '-':
                auto = sys.stdin.read()
            else:
                with open(a) as f:
                    auto = f.read()
        elif o == "--report":
            report = True
        elif o == "--ignore-duplicates":
            ignore_duplicates = True
        elif o == "--quiet":
            quiet = True
        elif o == "--duplicate-file":
            duplicate_file = a
        elif o in ("-h", "--help"):
            usage()
            sys.exit()
    if len(args) < 1:
        usage()
    else:
        autocraig(args[-1], auto=auto, report=report,
                  ignore_duplicates=ignore_duplicates, quiet=quiet,
                  duplicates_file=duplicate_file)

if __name__ == '__main__':
    main()
413 html2text.py
@@ -0,0 +1,413 @@
+"""html2text: Turn HTML into equivalent Markdown-structured text."""
+__version__ = "2.28"
+__author__ = "Aaron Swartz (me@aaronsw.com)"
+__copyright__ = "(C) 2004-2007 Aaron Swartz. GNU GPL 2."
+__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes"]
+
+# TODO:
+# Support decoded entities with unifiable.
+# Relative URL resolution
+
# Python 2.2 compatibility: older interpreters lack the True/False builtins.
if not hasattr(__builtins__, 'True'): True, False = 1, 0
import re, sys, urllib, htmlentitydefs, codecs, StringIO, types
import sgmllib
# Patch sgmllib's character-reference recognizer so hexadecimal references
# (&#xA0;) are matched as well as decimal ones.
sgmllib.charref = re.compile('&#([xX]?[0-9a-fA-F]+)[^0-9a-fA-F]')

# textwrap only exists from Python 2.3; without it wrapping is unavailable.
try: from textwrap import wrap
except: pass

# Use Unicode characters instead of their ascii psuedo-replacements
UNICODE_SNOB = 0

# Put the links after each paragraph instead of at the end.
LINKS_EACH_PARAGRAPH = 0

# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
BODY_WIDTH = 0
+
+### Entity Nonsense ###
+
def name2cp(k):
    """Map an HTML entity name (e.g. 'amp') to its Unicode code point."""
    if k == 'apos': return ord("'")
    if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3
        return htmlentitydefs.name2codepoint[k]
    else:
        # fall back to the raw entitydefs table: either a "&#nnn;" reference
        # or a latin-1 encoded character
        k = htmlentitydefs.entitydefs[k]
        if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1
        return ord(codecs.latin_1_decode(k)[0])
+
# ASCII stand-ins for common entities, used unless UNICODE_SNOB is set.
unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"',
'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*',
'ndash':'-', 'oelig':'oe', 'aelig':'ae',
'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a',
'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e',
'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i',
'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o',
'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u'}

# Same table keyed by code point instead of entity name (built below).
unifiable_n = {}

for k in unifiable.keys():
    unifiable_n[name2cp(k)] = unifiable[k]
+
def charref(name):
    """Decode the body of a numeric character reference ('65' or 'x41')
    into a character, preferring the ascii stand-in unless UNICODE_SNOB."""
    if name[0] in ['x','X']:
        c = int(name[1:], 16)
    else:
        c = int(name)

    if not UNICODE_SNOB and c in unifiable_n.keys():
        return unifiable_n[c]
    else:
        return unichr(c)
+
def entityref(c):
    """Decode a named entity ('amp', 'nbsp', ...) into a character; unknown
    names are returned as the literal '&name' text."""
    if not UNICODE_SNOB and c in unifiable.keys():
        return unifiable[c]
    else:
        try: name2cp(c)
        except KeyError: return "&" + c
        else: return unichr(name2cp(c))
+
def replaceEntities(s):
    """re.sub callback: decode one matched entity (numeric or named)."""
    s = s.group(1)
    if s[0] == "#":
        return charref(s[1:])
    else: return entityref(s)

# Matches &#nnn; / &#xhh; numeric references as well as named entities.
r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
def unescape(s):
    """Replace every HTML entity in *s* with its text equivalent."""
    return r_unescape.sub(replaceEntities, s)
+
def fixattrs(attrs):
    """Work around an sgmllib bug: attribute values arrive with HTML
    entities still escaped, so unescape each value.  A falsy *attrs*
    (None or empty) is returned unchanged."""
    if not attrs:
        return attrs
    return [(name, unescape(value)) for (name, value) in attrs]
+
+### End Entity Nonsense ###
+
def onlywhite(line):
    """Return a true value if *line* consists only of whitespace characters.

    fix: the original compared each character against ' ' twice — upstream
    html2text compares against space and TAB, and the tab was lost — and
    used identity (`is`) comparisons on strings.  Tabs now count as
    whitespace and membership (`in`) is used.  For an all-whitespace line
    the line itself is returned (truthy unless empty), otherwise False;
    callers only rely on truthiness.
    """
    for c in line:
        if c not in ' \t':
            return False
    return line
+
def optwrap(text):
    """Wrap all paragraphs in the provided text.

    Returns *text* unchanged when BODY_WIDTH is 0 (wrapping disabled, the
    default in this file).
    """
    if not BODY_WIDTH:
        return text

    assert wrap # Requires Python 2.3.
    result = ''
    newlines = 0
    for para in text.split("\n"):
        if len(para) > 0:
            # don't wrap indented or bullet/dash lines
            # NOTE(review): the `is not` identity comparisons on 1-char
            # strings rely on CPython's string interning
            if para[0] is not ' ' and para[0] is not '-' and para[0] is not '*':
                for line in wrap(para, BODY_WIDTH):
                    result += line + "\n"
                result += "\n"
                newlines = 2
            else:
                if not onlywhite(para):
                    result += para + "\n"
                    newlines = 1
        else:
            # collapse runs of blank lines to at most one blank line
            if newlines < 2:
                result += "\n"
                newlines += 1
    return result
+
def hn(tag):
    """Heading level of *tag*: 1-9 for 'h1'..'h9', 0 when the suffix is not
    a digit (e.g. 'hr' -> handled elsewhere, 'hx'), None otherwise."""
    if tag[0] == 'h' and len(tag) == 2:
        try:
            level = int(tag[1])
        except ValueError:
            return 0
        if 1 <= level <= 9:
            return level
+
class _html2text(sgmllib.SGMLParser):
    """SGML parser that incrementally converts HTML into Markdown text.

    Output is pushed through *out* (defaults to sys.stdout.write); pass
    out=None to accumulate the result in self.outtext instead, in which
    case close() returns it.
    """

    def __init__(self, out=sys.stdout.write):
        sgmllib.SGMLParser.__init__(self)

        if out is None: self.out = self.outtextf
        else: self.out = out
        self.outtext = u''      # collected output (only when out is None)
        self.quiet = 0          # >0 while inside <head>/<style>/<script>
        self.p_p = 0            # pending line breaks to emit before next data
        self.outcount = 0       # number of o() emissions; used to place links
        self.start = 1          # true until the first real output
        self.space = 0          # a deferred single space is pending
        self.a = []             # links awaiting "[n]: url" reference emission
        self.astack = []        # currently open <a> tags
        self.acount = 0         # running link counter
        self.list = []          # stack of open list contexts (name + item num)
        self.blockquote = 0     # current blockquote nesting depth
        self.pre = 0            # true inside a <pre> block
        self.startpre = 0       # a <pre> block just opened
        self.lastWasNL = 0      # last emitted character was a newline

    def outtextf(self, s):
        # Accumulating writer used when out is None.
        if type(s) is type(''): s = codecs.utf_8_decode(s)[0]
        self.outtext += s

    def close(self):
        """Finish parsing, flush pending output, and return the collected
        text (empty unless constructed with out=None)."""
        sgmllib.SGMLParser.close(self)

        self.pbr()
        self.o('', 0, 'end')  # flush pending breaks and trailing link refs

        return self.outtext

    def handle_charref(self, c):
        # Numeric character reference, e.g. &#65; or &#x41;.
        self.o(charref(c))

    def handle_entityref(self, c):
        # Named entity reference, e.g. &amp;.
        self.o(entityref(c))

    def unknown_starttag(self, tag, attrs):
        self.handle_tag(tag, attrs, 1)

    def unknown_endtag(self, tag):
        self.handle_tag(tag, None, 0)

    def previousIndex(self, attrs):
        """ returns the index of certain set of attributes (of a link) in the
        self.a list

        If the set of attributes is not found, returns None
        """
        if not attrs.has_key('href'): return None

        i = -1
        for a in self.a:
            i += 1
            match = 0

            # same href, and titles (when present) must agree too
            if a.has_key('href') and a['href'] == attrs['href']:
                if a.has_key('title') or attrs.has_key('title'):
                    if (a.has_key('title') and attrs.has_key('title') and
                        a['title'] == attrs['title']):
                        match = True
                else:
                    match = True

            if match: return i

    def handle_tag(self, tag, attrs, start):
        """Translate one start (start=1) or end (start=0) tag to Markdown."""
        attrs = fixattrs(attrs)

        if hn(tag):
            # headings become "#"-prefixed lines
            self.p()
            if start: self.o(hn(tag)*"#" + ' ')

        if tag in ['p', 'div']: self.p()

        if tag == "br" and start: self.o(" \n")

        if tag == "hr" and start:
            self.p()
            self.o("* * *")
            self.p()

        if tag in ["head", "style", 'script']:
            # suppress all output while inside these elements
            if start: self.quiet += 1
            else: self.quiet -= 1

        if tag == "blockquote":
            if start:
                self.p(); self.o('> ', 0, 1); self.start = 1
                self.blockquote += 1
            else:
                self.blockquote -= 1
                self.p()

        if tag in ['em', 'i', 'u']: self.o("_")
        if tag in ['strong', 'b']: self.o("**")
        if tag == "code" and not self.pre: self.o('`') #TODO: `` `this` ``

        if tag == "a":
            if start:
                attrsD = {}
                for (x, y) in attrs: attrsD[x] = y
                attrs = attrsD
                if attrs.has_key('href'):
                    self.astack.append(attrs)
                    self.o("[")
                else:
                    # anchor without href: remember it but emit nothing
                    self.astack.append(None)
            else:
                if self.astack:
                    a = self.astack.pop()
                    if a:
                        # reuse the reference number of an identical link
                        i = self.previousIndex(a)
                        if i is not None:
                            a = self.a[i]
                        else:
                            self.acount += 1
                            a['count'] = self.acount
                            a['outcount'] = self.outcount
                            self.a.append(a)
                        self.o("][" + `a['count']` + "]")

        if tag == "img" and start:
            attrsD = {}
            for (x, y) in attrs: attrsD[x] = y
            attrs = attrsD
            if attrs.has_key('src'):
                # images are emitted as reference-style links: ![alt][n]
                attrs['href'] = attrs['src']
                alt = attrs.get('alt', '')
                i = self.previousIndex(attrs)
                if i is not None:
                    attrs = self.a[i]
                else:
                    self.acount += 1
                    attrs['count'] = self.acount
                    attrs['outcount'] = self.outcount
                    self.a.append(attrs)
                self.o("![")
                self.o(alt)
                self.o("]["+`attrs['count']`+"]")

        if tag == 'dl' and start: self.p()
        if tag == 'dt' and not start: self.pbr()
        if tag == 'dd' and start: self.o(' ')
        if tag == 'dd' and not start: self.pbr()

        if tag in ["ol", "ul"]:
            if start:
                self.list.append({'name':tag, 'num':0})
            else:
                if self.list: self.list.pop()

            self.p()

        if tag == 'li':
            if start:
                self.pbr()
                if self.list: li = self.list[-1]
                else: li = {'name':'ul', 'num':0}
                self.o(" "*len(self.list)) #TODO: line up <ol><li>s > 9 correctly.
                if li['name'] == "ul": self.o("* ")
                elif li['name'] == "ol":
                    li['num'] += 1
                    self.o(`li['num']`+". ")
                self.start = 1
            else:
                self.pbr()

        if tag in ["table", "tr"] and start: self.p()
        if tag == 'td': self.pbr()

        if tag == "pre":
            if start:
                self.startpre = 1
                self.pre = 1
            else:
                self.pre = 0
                self.p()

    def pbr(self):
        # Schedule a single line break unless a bigger break is pending.
        if self.p_p == 0: self.p_p = 1

    def p(self): self.p_p = 2  # schedule a paragraph break

    def o(self, data, puredata=0, force=0):
        """Emit *data*, applying pending breaks, blockquote prefixes and
        deferred link references.

        puredata=1 collapses whitespace runs (outside <pre>); force is 1 to
        emit even when data is empty, or the string 'end' at end of input.
        """
        if not self.quiet:
            if puredata and not self.pre:
                data = re.sub('\s+', ' ', data)
                if data and data[0] == ' ':
                    # defer a leading space until we know more output follows
                    self.space = 1
                    data = data[1:]
            if not data and not force: return

            if self.startpre:
                #self.out(" :") #TODO: not output when already one there
                self.startpre = 0

            # prefix for the current blockquote nesting level
            bq = (">" * self.blockquote)
            if not (force and data and data[0] == ">") and self.blockquote: bq += " "

            if self.pre:
                bq += " "
                data = data.replace("\n", "\n"+bq)

            if self.start:
                # swallow whitespace/breaks pending before any real output
                self.space = 0
                self.p_p = 0
                self.start = 0

            if force == 'end':
                # It's the end.
                self.p_p = 0
                self.out("\n")
                self.space = 0


            if self.p_p:
                self.out(('\n'+bq)*self.p_p)
                self.space = 0

            if self.space:
                if not self.lastWasNL: self.out(' ')
                self.space = 0

            # flush collected link references at paragraph breaks (optional)
            # or at the very end of the document
            if self.a and ((self.p_p == 2 and LINKS_EACH_PARAGRAPH) or force == "end"):
                if force == "end": self.out("\n")

                newa = []
                for link in self.a:
                    if self.outcount > link['outcount']:
                        self.out(" ["+`link['count']`+"]: " + link['href']) #TODO: base href
                        if link.has_key('title'): self.out(" ("+link['title']+")")
                        self.out("\n")
                    else:
                        newa.append(link)

                if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.

                self.a = newa

            self.p_p = 0
            self.out(data)
            self.lastWasNL = data and data[-1] == '\n'
            self.outcount += 1

    def handle_data(self, data):
        self.o(data, 1)

    def unknown_decl(self, data): pass
+
def wrapwrite(text):
    """Encode *text* as UTF-8 and write it to standard output."""
    encoded = text.encode('utf8')
    sys.stdout.write(encoded)
+
def html2text_file(html, out=wrapwrite):
    """Feed *html* through the parser, emitting Markdown via *out*.

    When *out* is None the converted text is accumulated internally and
    returned by close(); otherwise it is streamed to *out* and the returned
    string is empty.
    """
    h = _html2text(out)
    h.feed(html)
    h.feed("")  # flush the parser
    return h.close()
+
def html2text(html):
    """Convert an HTML string to (optionally wrapped) Markdown text."""
    converted = html2text_file(html, None)
    return optwrap(converted)
+
+if __name__ == "__main__":
+ if sys.argv[1:]:
+ arg = sys.argv[1]
+ if arg.startswith('http://'):
+ j = urllib.urlopen(arg)
+ try:
+ from feedparser import _getCharacterEncoding as enc
+ except ImportError:
+ enc = lambda x, y: y, x
+ text = j.read()
+ encoding = enc(j.headers, text)[0]
+ if encoding == 'us-ascii': encoding = 'utf-8'
+ data = html2text_file(text.decode(encoding))
+
+ else:
+ data = open(arg, 'r').read()
+ else:
+ data = sys.stdin.read()
+ html2text_file(data)
+
142 send_mail.py
@@ -0,0 +1,142 @@
+# Author : Alex Ksikes
+# Parts from Python Network Programming
+
+# requires:
+# - a configured smtp sever
+
+from email.MIMEText import MIMEText
+from email.MIMEMultipart import MIMEMultipart
+from email.MIMEBase import MIMEBase
+from email import Utils, Encoders
+import mimetypes, smtplib, sys
+
def genpart(data, contentype):
    """Build a MIME part for *data* given a 'maintype/subtype' content type.

    Text parts become MIMEText; anything else is a base64-encoded
    MIMEBase payload.
    """
    maintype, subtype = contentype.split('/')
    if maintype == 'text':
        return MIMEText(data, _subtype=subtype)
    part = MIMEBase(maintype, subtype)
    part.set_payload(data)
    Encoders.encode_base64(part)
    return part
+
def attachment(filename):
    """Return a MIME part for *filename*, marked as an attachment.

    The content type is guessed from the file name; compressed or unknown
    files fall back to application/octet-stream.

    fix: the file handle is now managed with `with`, so it is closed even
    when genpart() raises (the original leaked it on the error path).
    """
    mimetype, mimeencoding = mimetypes.guess_type(filename)
    if mimeencoding or (mimetype is None):
        mimetype = 'application/octet-stream'
    with open(filename, 'rb') as fd:
        retval = genpart(fd.read(), mimetype)
    retval.add_header('Content-Disposition', 'attachment', filename=filename)
    return retval
+
class Mailer:
    """Minimal SMTP mailer: compose() builds a MIME message, send()
    delivers it through the configured SMTP server (default localhost)."""

    def __init__(self, server='localhost', verbose=False):
        self.server = server    # SMTP host used by send()
        self.verbose = verbose  # enables smtplib debug output

    def compose(self, to_addrs=[], from_addr='', subject='', message='', cc_addrs=[],
                bcc_addrs=[], content_type='text/plain', attachments=[]):
        """Build the MIME message and store it as self.msg (a string).

        NOTE(review): only the first To/Cc address appears in the headers
        (to_addrs[0], cc_addrs[0]) even though sendmail() receives the whole
        list; bcc_addrs is accepted but never used.
        """
        self.subject = subject
        self.to_addrs = to_addrs
        self.from_addr = from_addr

        # plain text without attachments fits in a single part
        if not attachments and content_type == 'text/plain':
            msg = MIMEText(message)
        else:
            msg = MIMEMultipart()

        # should be refactored
        msg['To'] = to_addrs[0] # to be changed to handle multiple to
        msg['From'] = from_addr
        if cc_addrs:
            msg['Cc'] = cc_addrs[0] # to be changed to handle multiple cc
        msg['Subject'] = subject
        msg['Date'] = Utils.formatdate(localtime=1)
        msg['Message-ID'] = Utils.make_msgid()

        # non-plain bodies go into a multipart/alternative wrapper
        if content_type != 'text/plain':
            body = MIMEMultipart('alternative')
            body.attach(genpart(message, content_type))
            msg.attach(body)

        for a in attachments:
            msg.attach(attachment(a))

        self.msg = msg.as_string()

    def send(self):
        """Deliver the previously composed message over SMTP."""
        s = smtplib.SMTP(self.server)
        if self.verbose:
            s.set_debuglevel(1)
        s.sendmail(self.from_addr, self.to_addrs, self.msg)
        s.close()
        if self.verbose:
            print "Message successfully sent %d recipient(s)" % len(self.to_addrs)
+
def send_mail(to_addrs=None, from_addr='', subject='', message='', cc_addrs=None,
              bcc_addrs=None, content_type='text/plain', attachments=None, verbose=False):
    """Compose and send one email via a local SMTP server.

    Convenience wrapper around Mailer.compose() + Mailer.send().

    fix: the list parameters previously used mutable default arguments
    (shared across calls); they now default to None and are normalized to
    fresh empty lists.
    """
    m = Mailer(verbose=verbose)
    m.compose(to_addrs or [], from_addr, subject, message,
              cc_addrs or [], bcc_addrs or [], content_type, attachments or [])
    m.send()
+
+def usage():
+ print "Usage: python mail.py [options] message_file"
+ print
+ print "Description:"
+ print "Sends an email using python email library"
+ print
+ print "Options:"
+ print "-m, --mail : email_1@hostname,...,email_n@hostname"
+ print "-c, --content-type: content type"
+ print "-a, --attachments : files to be attached"
+ print "-s, --subject : subject of the email"
+ print "-f, --from : from of the email"
+ print "-r, --reply : and reply of the email sent"
+ print "--cc : carbon copy"
+ print "-v, --verbose : show all"
+ print
+ print "Email bugs/suggestions to Alex Ksikes (alex.ksikes@gmail.com)"
+
import sys, getopt

def main():
    """Parse command line options and send the message file given as the
    last positional argument."""
    try:
        opts, args = getopt.getopt(sys.argv[1:], "m:c:a:s:f:r:vh",
                                   ["mail=", "content-type=", "attachments=",
                                    "subject=", "from=", "reply=", "cc=",
                                    "verbose", "help"])
    except getopt.GetoptError:
        usage()
        sys.exit(2)

    content_type = 'text/plain'
    to_addrs, cc_addrs, attachments = [], [], []
    subject = from_addr = reply = ''
    verbose = False
    for o, a in opts:
        if o in ("-m", "--mail"):
            to_addrs = a.split(',')
        elif o in ("-c", "--content-type"):
            content_type = a
        elif o in ("-a", "--attachments"):
            attachments = a.split()
        elif o in ("-s", "--subject"):
            subject = a
        elif o in ("-f", "--from"):
            from_addr = a
        elif o in ("-r", "--reply"):
            # fix: -r/--reply was declared in getopt but never handled.
            # TODO: send_mail() does not yet set a Reply-To header.
            reply = a
        elif o in ("--cc",):
            # fix: ("--cc") was a bare string, so `o in` did substring
            # matching; a one-element tuple restores exact matching.
            cc_addrs = a.split(',')
        elif o in ("-v", "--verbose"):
            verbose = True
        elif o in ("-h", "--help"):
            usage()
            sys.exit()
    if len(args) < 1:
        usage()
    else:
        # fix: read the message from the last *positional* argument instead
        # of sys.argv[-1] (which could be an option value), and close it.
        with open(args[-1]) as f:
            message = f.read()
        send_mail(to_addrs=to_addrs, from_addr=from_addr, subject=subject,
                  message=message, cc_addrs=cc_addrs, content_type=content_type,
                  attachments=attachments, verbose=verbose)

if __name__ == '__main__':
    main()
40 text_similarity.py
@@ -0,0 +1,40 @@
+# Author : Alex Ksikes
+
def vector(txt):
    """Bag-of-words: map each whitespace token of *txt* to its count."""
    v = {}
    for t in txt.split():
        v[t] = v.get(t, 0) + 1
    return v

def dot(v1, v2):
    """Inner product of two bag-of-words vectors (token -> count dicts).

    fix: dict.has_key() (removed in Python 3) replaced with `in`.
    """
    return sum(count * v2[token] for token, count in v1.items() if token in v2)

def score(txt1, txt2):
    """Similarity of two texts: bag-of-words overlap normalized by the
    token length of the longer text.

    fix: returns 0.0 when both texts are empty (the original raised
    ZeroDivisionError).
    """
    norm = max(len(txt1.split()), len(txt2.split()))
    if norm == 0:
        return 0.0
    return 1.0 * dot(vector(txt1), vector(txt2)) / norm
+
def most_similar(txt1, txt_list):
    """Return the text in *txt_list* with the largest (unnormalized)
    bag-of-words overlap with *txt1*; "" when the list is empty."""
    v1 = vector(txt1)
    best_txt, best_score = "", -1
    for candidate in txt_list:
        # renamed local (was `score`) to avoid shadowing the module function
        current = dot(v1, vector(candidate))
        if current > best_score:
            best_score, best_txt = current, candidate
    return best_txt
+
import sys
if __name__ == '__main__':
    # Command line: compare two texts and print their similarity score.
    if len(sys.argv) == 1:
        print 'Usage: python text_similarity.py text1 text2'
    else:
        print score(sys.argv[1], sys.argv[2])

0 comments on commit 105ebac

Please sign in to comment.
Something went wrong with that request. Please try again.